various updates

2026-01-06 09:29:33 -06:00 · 2024-08-10 13:07:49 +12:00
parent ff4ad4d257
commit b7d7568cc4
11 changed files with 430 additions and 242 deletions
--- a/Web/FlowElements/Downloader.cs
+++ b/Web/FlowElements/Downloader.cs
@@ -1,5 +1,6 @@
 using System.Net;
 using System.Text.RegularExpressions;
+using FileFlows.Web.Helpers;

 namespace FileFlows.Web.FlowElements;

@@ -35,8 +36,6 @@ public class Downloader : Node
    [TextVariable(1)]
    public string Url { get; set; } = null!;

-    private static HttpClient? client;
-
    /// <inheritdoc />
    public override int Execute(NodeParameters args)
    {
@@ -48,7 +47,7 @@ public class Downloader : Node
            return -1;
        }

-        var result = Download(args.Logger!, url, args.TempPath, (percent) =>
+        var result = DownloadHelper.Download(args.Logger!, url, args.TempPath, (percent) =>
            {
                args.PartPercentageUpdate?.Invoke(percent);
            });
@@ -64,196 +63,4 @@ public class Downloader : Node

        return 1;
    }
-    
-    /// <summary>
-    /// Performs the download
-    /// </summary>
-    /// <param name="logger">the logger to use</param>
-    /// <param name="url">the URL to download</param>
-    /// <param name="destinationPath">the destination path</param>
-    /// <param name="percentUpdate">the percent update</param>
-    /// <returns>the name of the file if successful, otherwise an error</returns>
-     public Result<string> Download(ILogger logger, string url, string destinationPath, Action<float> percentUpdate)
-    {
-        if (client == null)
-        {
-
-            var handler = new HttpClientHandler
-            {
-                AllowAutoRedirect = true,
-                AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate
-            };
-            client = new HttpClient(handler);
-            client.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
-        }
-
-
-        try
-        {
-            var tempFile = Path.Combine(destinationPath, Guid.NewGuid().ToString());
-
-            using (var response = client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead).Result)
-            {
-                if (!response.IsSuccessStatusCode)
-                {
-                    return Result<string>.Fail($"Failed to download URL: {url}. Status code: {response.StatusCode}");
-                }
-
-                var contentType = response.Content.Headers.ContentType?.MediaType;
-                if(string.IsNullOrEmpty(contentType) == false)
-                    logger?.ILog("ContentType: " + contentType);
-                var fileExtension = GetFileExtensionFromContentType(contentType);
-
-                // Check if the URL response contains a filename
-                if (response.Content.Headers.ContentDisposition?.FileName != null)
-                {
-                    var sanitizedFileName = SanitizeFileName(response.Content.Headers.ContentDisposition.FileName.Trim('"'));
-                    tempFile = Path.Combine(destinationPath, sanitizedFileName);
-                }
-                else
-                {
-                    if (fileExtension == null)
-                    {
-                        // Check for common file headers if the content type is not recognized
-                        var buffer = new byte[512];
-                        using (var contentStream = response.Content.ReadAsStreamAsync().Result)
-                        {
-                            contentStream.Read(buffer, 0, buffer.Length);
-                            fileExtension = GetFileExtensionFromHeader(buffer) ?? ".html";
-                            contentStream.Position = 0; // Reset stream position for reading again
-                        }
-                    }
-
-                    tempFile += fileExtension;
-                }
-
-                using (var contentStream = response.Content.ReadAsStreamAsync().Result)
-                using (var fileStream = new FileStream(tempFile, FileMode.Create, FileAccess.Write, FileShare.None))
-                {
-                    var totalBytes = response.Content.Headers.ContentLength ?? -1L;
-                    var totalRead = 0L;
-                    var buffer = new byte[8192];
-                    var isMoreToRead = true;
-
-                    while (isMoreToRead)
-                    {
-                        var read = contentStream.ReadAsync(buffer, 0, buffer.Length).Result;
-                        if (read == 0)
-                        {
-                            isMoreToRead = false;
-                            continue;
-                        }
-
-                        fileStream.WriteAsync(buffer, 0, read).Wait();
-                        totalRead += read;
-
-                        if (totalBytes != -1)
-                        {
-                            var progress = (float)totalRead / totalBytes;
-                            percentUpdate?.Invoke(progress);
-                        }
-                    }
-                }
-            }
-
-            logger?.ILog($"Downloaded file saved to: {tempFile}");
-            return tempFile;
-        }
-        catch (Exception ex)
-        {
-            return Result<string>.Fail($"Exception during download: {ex.Message}");
-        }
-    }
-
-
-    /// <summary>
-    /// Gets the file extension from the content type.
-    /// </summary>
-    /// <param name="contentType">The content type.</param>
-    /// <returns>The corresponding file extension, or null if not recognized.</returns>
-    private string? GetFileExtensionFromContentType(string? contentType)
-    {
-        switch (contentType)
-        {
-            case "text/html": return ".html";
-            case "image/jpeg": return ".jpg";
-            case "image/png": return ".png";
-            case "image/gif": return ".gif";
-            case "application/pdf": return ".pdf";
-            case "application/zip": return ".zip";
-            case "application/json": return ".json";
-            case "text/plain": return ".txt";
-            case "audio/mpeg": return ".mp3";
-            case "video/mp4": return ".mp4";
-            case "application/vnd.ms-excel": return ".xls";
-            case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": return ".xlsx";
-            case "application/msword": return ".doc";
-            case "application/vnd.openxmlformats-officedocument.wordprocessingml.document": return ".docx";
-            case "application/vnd.ms-powerpoint": return ".ppt";
-            case "application/vnd.openxmlformats-officedocument.presentationml.presentation": return ".pptx";
-            case "application/x-rar-compressed": return ".rar";
-            case "application/x-tar": return ".tar";
-            case "application/x-7z-compressed": return ".7z";
-            // Add more content types and their corresponding file extensions as needed
-            default: return null;
-        }
-    }
-
-    /// <summary>
-    /// Gets the file extension from the file header bytes.
-    /// </summary>
-    /// <param name="fileHeader">The first few bytes of the file to identify its type.</param>
-    /// <returns>The corresponding file extension, or null if not recognized.</returns>
-    private string? GetFileExtensionFromHeader(byte[] fileHeader)
-    {
-        // Implement logic to identify file types based on header bytes
-        // Example: Check for common file signatures
-        if (fileHeader.Length >= 4)
-        {
-            // PDF file signature
-            if (fileHeader[0] == 0x25 && fileHeader[1] == 0x50 && fileHeader[2] == 0x44 && fileHeader[3] == 0x46)
-            {
-                return ".pdf";
-            }
-
-            // ZIP file signature
-            if (fileHeader[0] == 0x50 && fileHeader[1] == 0x4B &&
-                (fileHeader[2] == 0x03 || fileHeader[2] == 0x05 || fileHeader[2] == 0x07) && fileHeader[3] == 0x08)
-            {
-                return ".zip";
-            }
-
-            // PNG file signature
-            if (fileHeader[0] == 0x89 && fileHeader[1] == 0x50 && fileHeader[2] == 0x4E && fileHeader[3] == 0x47)
-            {
-                return ".png";
-            }
-
-            // JPEG file signature
-            if (fileHeader[0] == 0xFF && fileHeader[1] == 0xD8 && fileHeader[fileHeader.Length - 2] == 0xFF &&
-                fileHeader[fileHeader.Length - 1] == 0xD9)
-            {
-                return ".jpg";
-            }
-        }
-
-        return null;
-    }
-    
-
-    /// <summary>
-    /// Sanitizes the filename to ensure it does not contain any path traversal characters or invalid characters.
-    /// </summary>
-    /// <param name="fileName">The filename to sanitize.</param>
-    /// <returns>The sanitized filename.</returns>
-    private string SanitizeFileName(string fileName)
-    {
-        // Remove any path traversal characters
-        fileName = Regex.Replace(fileName, @"\.\.\/|\\|\.\.\\|\/", string.Empty);
-
-        // Only allow safe characters in the filename
-        fileName = Regex.Replace(fileName, @"[^a-zA-Z0-9_\-\.]", "_");
-
-        return fileName;
-    }
 }
--- a/Web/FlowElements/InputUrl.cs
+++ b/Web/FlowElements/InputUrl.cs
@@ -0,0 +1,51 @@
+using FileFlows.Web.Helpers;
+
+namespace FileFlows.Web.FlowElements;
+
+/// <summary>
+/// Flow input whose working file is a URL, with an option to download it
+/// </summary>
+public class InputUrl : Node
+{
+    /// <inheritdoc />
+    public override int Outputs => 1;
+    /// <inheritdoc />
+    public override FlowElementType Type => FlowElementType.Input;
+    /// <inheritdoc />
+    public override string Icon => "fas fa-globe";
+    /// <inheritdoc />
+    public override string HelpUrl => "https://fileflows.com/docs/plugins/web/input-url";
+    /// <inheritdoc />
+    public override string Group => "Web";
+
+    /// <summary>
+    /// Gets or sets whether the URL should be downloaded and made the working file
+    /// </summary>
+    [Boolean(1)]
+    public bool Download { get; set; }
+
+    /// <inheritdoc />
+    public override int Execute(NodeParameters args)
+    {
+        // the URL is always published as the "Url" variable, even when nothing is downloaded
+        var sourceUrl = args.WorkingFile;
+        args.Variables["Url"] = sourceUrl;
+        // without the download option there is nothing more to do
+        if (Download == false)
+            return 1;
+
+        var download = DownloadHelper.Download(args.Logger!, sourceUrl, args.TempPath,
+            percent => args.PartPercentageUpdate?.Invoke(percent));
+
+        if (download.Failed(out var failure))
+        {
+            args.FailureReason = failure;
+            args.Logger?.ELog(failure);
+            return -1;
+        }
+
+        // on success the download result becomes the working file
+        args.SetWorkingFile(download.Value);
+        return 1;
+    }
+}
--- a/Web/FlowElements/Parsers/HtmlImageParser.cs
+++ b/Web/FlowElements/Parsers/HtmlImageParser.cs
@@ -1,5 +1,6 @@
 using System.Net;
 using System.Text.RegularExpressions;
+using HtmlAgilityPack;

 namespace FileFlows.Web.FlowElements;

@@ -33,7 +34,10 @@ public class HtmlImageParser : HtmlParser
    protected override string VariableName => "ImageUrls";

    /// <inheritdoc />
-    protected override List<string> ParseHtml(ILogger? logger, string html)
+    protected override List<string> ParseHtml(NodeParameters args, string html) =>
+        ParseHtmlForUrls(args, html, tags: ["img"], attributes: ["src", "content"]);
+
+    private List<string> ParseHtmlOld(ILogger? logger, string html)
    {
        var imageUrls = new List<string>();
        var regex = new Regex("<img[^>]+src=(\"([^\"]*)\"|'([^']*)'|([^\\s>]+))", RegexOptions.IgnoreCase);
--- a/Web/FlowElements/Parsers/HtmlLinkParser.cs
+++ b/Web/FlowElements/Parsers/HtmlLinkParser.cs
@@ -33,18 +33,53 @@ public class HtmlLinkParser : HtmlParser
    protected override string VariableName => "Links";

    /// <inheritdoc />
-    protected override List<string> ParseHtml(ILogger? logger, string html)
+    protected override List<string> ParseHtml(NodeParameters args, string html) =>
+        ParseHtmlForUrls(args, html, tags: ["a"], attributes: ["href"]);
+
+    private List<string> ParseHtmlOld(ILogger? logger, string html)
    {
        var urls = new List<string>();
        var regex = new Regex("<a[^>]+href=(\"([^\"]*)\"|'([^']*)'|([^\\s>]+))", RegexOptions.IgnoreCase);
        var matches = regex.Matches(html);

+        string? baseUrl = null;
+        if (Variables.TryGetValue("Url", out var oUrl) && oUrl is string sBaseUrl)
+        {
+            try
+            {
+                var uri = new Uri(sBaseUrl);
+
+                // Get the absolute path without the query parameters
+                baseUrl = uri.GetLeftPart(UriPartial.Path);
+
+                // Ensure the path ends with a slash
+                if (baseUrl.EndsWith("/") == false)
+                    baseUrl += "/";
+
+                // Use the folderPath as needed
+                logger?.ILog("Base URL: " + baseUrl);
+            }
+            catch (Exception)
+            {
+                // Ignored
+            }
+        }
+        
        foreach (Match match in matches)
        {
            if (match.Groups.Count > 1)
            {
                var url = match.Groups[1].Value.TrimStart('"', '\'').TrimEnd('"', '\'');
-                urls.Add(WebUtility.HtmlDecode(url));
+                url = WebUtility.HtmlDecode(url);
+                if (baseUrl != null && Regex.IsMatch(url, "^http(s)://", RegexOptions.IgnoreCase) == false)
+                {
+                    logger?.ILog("Relative URL: " + url);
+                    if (url.StartsWith("/"))
+                        url = url[1..];
+                    url = baseUrl + url;
+                    logger?.ILog("Absolute URL: " + url);
+                }
+                urls.Add(url);
            }
        }
        
--- a/Web/FlowElements/Parsers/HtmlParser.cs
+++ b/Web/FlowElements/Parsers/HtmlParser.cs
@@ -1,5 +1,6 @@
 using System.Net;
 using System.Text.RegularExpressions;
+using HtmlAgilityPack;

 namespace FileFlows.Web.FlowElements;

@@ -50,7 +51,7 @@ public abstract class HtmlParser : Node

        var html = result.Value;

-        var list = ParseHtml(args.Logger, html);
+        var list = ParseHtml(args, html);

        var pattern = args.ReplaceVariables(Pattern ?? string.Empty, stripMissing: true);
        if (string.IsNullOrWhiteSpace(pattern) == false)
@@ -87,7 +88,8 @@ public abstract class HtmlParser : Node
            args.Logger?.ILog("Found item: " + item);
        }

-        args.Variables[VariableName] = list;
+        if(string.IsNullOrWhiteSpace(VariableName) == false)
+            args.Variables[VariableName] = list;
        // current list is the default current list FileFLows will use in a list flow element if no list is specified
        args.Variables["CurrentList"] = list;
        
@@ -97,10 +99,10 @@ public abstract class HtmlParser : Node
    /// <summary>
    /// Parses the HTML
    /// </summary>
-    /// <param name="logger">the logger to use</param>
+    /// <param name="args">the node parameters</param>
    /// <param name="html">the HTML to parse</param>
    /// <returns>the items found while pasrsing</returns>
-    protected abstract List<string> ParseHtml(ILogger? logger, string html);
+    protected abstract List<string> ParseHtml(NodeParameters args, string html);

    /// <summary>
    /// Gets the file content
@@ -138,4 +140,60 @@ public abstract class HtmlParser : Node

        return File.ReadAllText(localFileResult.Value);
    }
+    
+    
+    /// <summary>
+    /// Parses the HTML for the specified tags and attributes, resolving relative values against the "Url" variable
+    /// </summary>
+    /// <param name="args">the node parameters; its "Url" variable, when a valid absolute URL, is the base for relative values</param>
+    /// <param name="html">the HTML to parse</param>
+    /// <param name="tags">the HTML tags to look for</param>
+    /// <param name="attributes">the attributes to look for</param>
+    /// <returns>a list of matching URLs</returns>
+    protected List<string> ParseHtmlForUrls(NodeParameters args, string html, string[] tags, string[] attributes)
+    {
+        var htmlDoc = new HtmlDocument();
+        htmlDoc.LoadHtml(html);
+
+        // TryCreate instead of new Uri(): a malformed "Url" variable must not throw and fail the flow element
+        Uri? baseUri = null;
+        if (args.Variables.TryGetValue("Url", out var oUrl) && oUrl is string sBaseUrl)
+        {
+            if (Uri.TryCreate(sBaseUrl, UriKind.Absolute, out baseUri))
+                args.Logger?.ILog("Base URL: " + baseUri);
+            else
+                args.Logger?.ILog("Invalid base URL, relative URLs will be skipped: " + sBaseUrl);
+        }
+
+        List<string> results = new();
+
+        foreach (var tag in tags)
+        {
+            var nodes = htmlDoc.DocumentNode.SelectNodes($"//{tag}");
+            if (nodes == null) continue;
+
+            foreach (var ele in nodes)
+            {
+                foreach (var att in attributes)
+                {
+                    var srcValue = ele.GetAttributeValue(att, string.Empty);
+                    if (string.IsNullOrEmpty(srcValue))
+                        continue;
+
+                    // require the full scheme prefix so relative paths such as "httpdocs/a.png" are still resolved
+                    if (srcValue.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
+                        srcValue.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
+                    {
+                        results.Add(srcValue);
+                    }
+                    else if (baseUri != null && Uri.TryCreate(srcValue, UriKind.Relative, out var relativeSrcUri))
+                    {
+                        results.Add(new Uri(baseUri, relativeSrcUri).ToString());
+                    }
+                }
+            }
+        }
+
+        return results;
+    }
 }
--- a/Web/FlowElements/UrlToPath.cs
+++ b/Web/FlowElements/UrlToPath.cs
@@ -1,39 +0,0 @@
-namespace FileFlows.Web.FlowElements;
-
-/// <summary>
-/// 
-/// </summary>
-public class UrlToRelativePath : Node
-{
-    public override int Inputs => 1;
-    public override int Outputs => 2;
-    public override string Icon => 
-
-    /// <summary>
-    /// Gets or sets the URL to get a path for
-    /// </summary>
-    [TextVariable(1)]
-    public string Url { get; set; } = null!;
-    
-    public override int Execute(NodeParameters args)
-    {
-        
-        // Create a Uri object from the URL
-        var uri = new Uri(x);
-
-        // Get the path without the query
-        var path = uri.AbsolutePath;
-
-        // Get the query part and replace the '=' and '&' with '-'
-        var query = uri.Query.TrimStart('?').Replace('=', '-').Replace('&', '/');
-
-        // Combine the path and the modified query
-        var fakePath = path.TrimEnd('/') + (string.IsNullOrEmpty(query) ? string.Empty : "/" + query);
-
-        // Remove leading slash
-        if (fakePath.StartsWith("/"))
-        {
-            fakePath = fakePath.Substring(1);
-        }
-    }
-}
--- a/Web/FlowElements/WebRequest.cs
+++ b/Web/FlowElements/WebRequest.cs
@@ -1,4 +1,4 @@
-namespace FileFlows.Web;
+namespace FileFlows.Web.FlowElements;

 using FileFlows.Plugin;
 using FileFlows.Plugin.Attributes;
@@ -22,6 +22,8 @@ public class WebRequest : Node
    public override string Icon => "fas fa-globe";
    /// <inheritdoc />
    public override string HelpUrl => "https://fileflows.com/docs/plugins/web/web-request";
+    /// <inheritdoc />
+    public override string Group => "Web";

    /// <summary>
    /// Gets or sets the URL