various updates

This commit is contained in:
John Andrews
2024-08-10 13:07:49 +12:00
parent ff4ad4d257
commit b7d7568cc4
11 changed files with 430 additions and 242 deletions

View File

@@ -1,5 +1,6 @@
using System.Net;
using System.Text.RegularExpressions;
using FileFlows.Web.Helpers;
namespace FileFlows.Web.FlowElements;
@@ -35,8 +36,6 @@ public class Downloader : Node
[TextVariable(1)]
public string Url { get; set; } = null!;
private static HttpClient? client;
/// <inheritdoc />
public override int Execute(NodeParameters args)
{
@@ -48,7 +47,7 @@ public class Downloader : Node
return -1;
}
var result = Download(args.Logger!, url, args.TempPath, (percent) =>
var result = DownloadHelper.Download(args.Logger!, url, args.TempPath, (percent) =>
{
args.PartPercentageUpdate?.Invoke(percent);
});
@@ -64,196 +63,4 @@ public class Downloader : Node
return 1;
}
/// <summary>
/// Downloads a URL into a file inside the destination path
/// </summary>
/// <remarks>
/// The file name comes from the Content-Disposition header when one is supplied (after sanitizing).
/// Otherwise a GUID name is generated and given an extension derived from the content type or,
/// failing that, from the first bytes of the body (defaulting to ".html").
/// </remarks>
/// <param name="logger">the logger to use</param>
/// <param name="url">the URL to download</param>
/// <param name="destinationPath">the directory the file is written into</param>
/// <param name="percentUpdate">called with bytesRead / Content-Length after each chunk; only invoked when the response declares a length</param>
/// <returns>the full path of the saved file if successful, otherwise a failure describing the error</returns>
public Result<string> Download(ILogger logger, string url, string destinationPath, Action<float> percentUpdate)
{
    if (client == null)
    {
        var handler = new HttpClientHandler
        {
            AllowAutoRedirect = true,
            AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate
        };
        client = new HttpClient(handler);
        client.DefaultRequestHeaders.UserAgent.ParseAdd("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
    }

    try
    {
        // GetAwaiter().GetResult() surfaces the real exception instead of an AggregateException wrapper
        using var response = client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead).GetAwaiter().GetResult();
        if (!response.IsSuccessStatusCode)
        {
            return Result<string>.Fail($"Failed to download URL: {url}. Status code: {response.StatusCode}");
        }

        var contentType = response.Content.Headers.ContentType?.MediaType;
        if (string.IsNullOrEmpty(contentType) == false)
            logger?.ILog("ContentType: " + contentType);

        // The response body is a forward-only stream: open it exactly once and never rewind it
        using var contentStream = response.Content.ReadAsStreamAsync().GetAwaiter().GetResult();

        // Bytes consumed while sniffing the file type; they must be written to the file first
        var sniffed = Array.Empty<byte>();

        string? fileName = null;
        var dispositionName = response.Content.Headers.ContentDisposition?.FileName;
        if (dispositionName != null)
        {
            var sanitized = SanitizeFileName(dispositionName.Trim('"'));
            // An empty or dots-only name would resolve to a directory, so fall back to a generated name
            if (sanitized.Trim('.').Length > 0)
                fileName = sanitized;
        }

        if (fileName == null)
        {
            var fileExtension = GetFileExtensionFromContentType(contentType);
            if (fileExtension == null)
            {
                // Content type not recognized, inspect the leading bytes of the body instead
                sniffed = ReadHeaderBytes(contentStream, 512);
                fileExtension = GetFileExtensionFromHeader(sniffed) ?? ".html";
            }
            fileName = Guid.NewGuid() + fileExtension;
        }

        var tempFile = Path.Combine(destinationPath, fileName);

        using (var fileStream = new FileStream(tempFile, FileMode.Create, FileAccess.Write, FileShare.None))
        {
            var totalBytes = response.Content.Headers.ContentLength ?? -1L;
            var totalRead = 0L;

            if (sniffed.Length > 0)
            {
                fileStream.Write(sniffed, 0, sniffed.Length);
                totalRead += sniffed.Length;
            }

            var buffer = new byte[8192];
            int read;
            while ((read = contentStream.Read(buffer, 0, buffer.Length)) > 0)
            {
                fileStream.Write(buffer, 0, read);
                totalRead += read;

                // Progress can only be reported when the server declared a non-zero length
                if (totalBytes > 0)
                    percentUpdate?.Invoke((float)totalRead / totalBytes);
            }
        }

        logger?.ILog($"Downloaded file saved to: {tempFile}");
        return tempFile;
    }
    catch (Exception ex)
    {
        return Result<string>.Fail($"Exception during download: {ex.Message}");
    }
}

/// <summary>
/// Reads up to the requested number of bytes from the start of a forward-only stream
/// </summary>
/// <param name="stream">the stream to read from</param>
/// <param name="maxBytes">the maximum number of bytes to read</param>
/// <returns>the bytes actually read, which may be fewer than requested if the stream ends early</returns>
private static byte[] ReadHeaderBytes(Stream stream, int maxBytes)
{
    var buffer = new byte[maxBytes];
    var filled = 0;
    // A single Read may return fewer bytes than requested, so keep reading until full or end of stream
    while (filled < maxBytes)
    {
        var read = stream.Read(buffer, filled, maxBytes - filled);
        if (read == 0)
            break;
        filled += read;
    }

    if (filled < maxBytes)
        Array.Resize(ref buffer, filled);
    return buffer;
}
/// <summary>
/// Maps a response media type to the file extension used when saving the download.
/// </summary>
/// <param name="contentType">The media type reported by the server, may be null.</param>
/// <returns>The extension including the leading dot, or null when the media type is not in the table.</returns>
private string? GetFileExtensionFromContentType(string? contentType) => contentType switch
{
    "text/html" => ".html",
    "image/jpeg" => ".jpg",
    "image/png" => ".png",
    "image/gif" => ".gif",
    "application/pdf" => ".pdf",
    "application/zip" => ".zip",
    "application/json" => ".json",
    "text/plain" => ".txt",
    "audio/mpeg" => ".mp3",
    "video/mp4" => ".mp4",
    "application/vnd.ms-excel" => ".xls",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => ".xlsx",
    "application/msword" => ".doc",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => ".docx",
    "application/vnd.ms-powerpoint" => ".ppt",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation" => ".pptx",
    "application/x-rar-compressed" => ".rar",
    "application/x-tar" => ".tar",
    "application/x-7z-compressed" => ".7z",
    // Extend the table here when further media types need to be recognized
    _ => null
};
/// <summary>
/// Gets the file extension from the leading bytes (magic number) of a file.
/// </summary>
/// <param name="fileHeader">The first few bytes of the file to identify its type.</param>
/// <returns>The corresponding file extension, or null if fewer than four bytes are supplied or the signature is not recognized.</returns>
private string? GetFileExtensionFromHeader(byte[] fileHeader)
{
    if (fileHeader == null || fileHeader.Length < 4)
        return null;

    byte b0 = fileHeader[0], b1 = fileHeader[1], b2 = fileHeader[2], b3 = fileHeader[3];

    // PDF: "%PDF"
    if (b0 == 0x25 && b1 == 0x50 && b2 == 0x44 && b3 == 0x46)
        return ".pdf";

    // ZIP: "PK" followed by 03 04 (local file header), 05 06 (empty archive) or 07 08 (spanned archive)
    if (b0 == 0x50 && b1 == 0x4B &&
        ((b2 == 0x03 && b3 == 0x04) || (b2 == 0x05 && b3 == 0x06) || (b2 == 0x07 && b3 == 0x08)))
        return ".zip";

    // PNG: 0x89 "PNG"
    if (b0 == 0x89 && b1 == 0x50 && b2 == 0x4E && b3 == 0x47)
        return ".png";

    // JPEG: start-of-image marker FF D8 followed by the FF that opens the next marker.
    // The end-of-image marker (FF D9) sits at the end of the file, not in a header buffer, so it is not checked.
    if (b0 == 0xFF && b1 == 0xD8 && b2 == 0xFF)
        return ".jpg";

    return null;
}
/// <summary>
/// Produces a file name that is safe to combine with a directory: path separators and "../" style
/// sequences are stripped, then every character outside letters, digits, '_', '-' and '.' becomes '_'.
/// </summary>
/// <param name="fileName">The raw file name, e.g. taken from a response header.</param>
/// <returns>The cleaned file name.</returns>
private string SanitizeFileName(string fileName)
{
    // First pass: drop traversal sequences and any remaining slash or backslash
    var withoutSeparators = Regex.Replace(fileName, @"\.\.\/|\\|\.\.\\|\/", string.Empty);

    // Second pass: substitute an underscore for anything not on the allow-list
    return Regex.Replace(withoutSeparators, @"[^a-zA-Z0-9_\-\.]", "_");
}
}

View File

@@ -0,0 +1,51 @@
using FileFlows.Web.Helpers;
namespace FileFlows.Web.FlowElements;
/// <summary>
/// Input flow element whose working file is a URL; it publishes the URL as a variable and can
/// optionally download it so the rest of the flow works on the downloaded file
/// </summary>
public class InputUrl : Node
{
    /// <inheritdoc />
    public override int Outputs => 1;

    /// <inheritdoc />
    public override FlowElementType Type => FlowElementType.Input;

    /// <inheritdoc />
    public override string Icon => "fas fa-globe";

    /// <inheritdoc />
    public override string HelpUrl => "https://fileflows.com/docs/plugins/web/input-url";

    /// <inheritdoc />
    public override string Group => "Web";

    /// <summary>
    /// Gets or sets whether the URL should be downloaded and used as the working file
    /// </summary>
    [Boolean(1)]
    public bool Download { get; set; }

    /// <summary>
    /// Stores the working file in the "Url" variable and, when <see cref="Download"/> is set,
    /// downloads it into the temp path and makes the downloaded file the working file
    /// </summary>
    /// <param name="args">the node parameters</param>
    /// <returns>1 on success, -1 if the download failed</returns>
    public override int Execute(NodeParameters args)
    {
        string url = args.WorkingFile;
        args.Variables["Url"] = url;

        if (!Download)
        {
            return 1;
        }

        var downloaded = DownloadHelper.Download(
            args.Logger!,
            url,
            args.TempPath,
            percent => args.PartPercentageUpdate?.Invoke(percent));

        if (downloaded.Failed(out var error))
        {
            args.FailureReason = error;
            args.Logger?.ELog(error);
            return -1;
        }

        args.SetWorkingFile(downloaded.Value);
        return 1;
    }
}

View File

@@ -1,5 +1,6 @@
using System.Net;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
namespace FileFlows.Web.FlowElements;
@@ -33,7 +34,10 @@ public class HtmlImageParser : HtmlParser
protected override string VariableName => "ImageUrls";
/// <inheritdoc />
protected override List<string> ParseHtml(ILogger? logger, string html)
protected override List<string> ParseHtml(NodeParameters args, string html)
{
    // Image URLs are read from the "src" and "content" attributes of <img> elements
    string[] tags = ["img"];
    string[] attributes = ["src", "content"];
    return ParseHtmlForUrls(args, html, tags, attributes);
}
private List<string> ParseHtmlOld(ILogger? logger, string html)
{
var imageUrls = new List<string>();
var regex = new Regex("<img[^>]+src=(\"([^\"]*)\"|'([^']*)'|([^\\s>]+))", RegexOptions.IgnoreCase);

View File

@@ -33,18 +33,53 @@ public class HtmlLinkParser : HtmlParser
protected override string VariableName => "Links";
/// <inheritdoc />
protected override List<string> ParseHtml(ILogger? logger, string html)
protected override List<string> ParseHtml(NodeParameters args, string html)
{
    // Links are read from the "href" attribute of <a> elements
    string[] tags = ["a"];
    string[] attributes = ["href"];
    return ParseHtmlForUrls(args, html, tags, attributes);
}
private List<string> ParseHtmlOld(ILogger? logger, string html)
{
var urls = new List<string>();
var regex = new Regex("<a[^>]+href=(\"([^\"]*)\"|'([^']*)'|([^\\s>]+))", RegexOptions.IgnoreCase);
var matches = regex.Matches(html);
string? baseUrl = null;
if (Variables.TryGetValue("Url", out var oUrl) && oUrl is string sBaseUrl)
{
try
{
var uri = new Uri(sBaseUrl);
// Get the absolute path without the query parameters
baseUrl = uri.GetLeftPart(UriPartial.Path);
// Ensure the path ends with a slash
if (baseUrl.EndsWith("/") == false)
baseUrl += "/";
// Use the folderPath as needed
logger?.ILog("Base URL: " + baseUrl);
}
catch (Exception)
{
// Ignored
}
}
foreach (Match match in matches)
{
if (match.Groups.Count > 1)
{
var url = match.Groups[1].Value.TrimStart('"', '\'').TrimEnd('"', '\'');
urls.Add(WebUtility.HtmlDecode(url));
url = WebUtility.HtmlDecode(url);
if (baseUrl != null && Regex.IsMatch(url, "^http(s)://", RegexOptions.IgnoreCase) == false)
{
logger?.ILog("Relative URL: " + url);
if (url.StartsWith("/"))
url = url[1..];
url = baseUrl + url;
logger?.ILog("Absolute URL: " + url);
}
urls.Add(url);
}
}

View File

@@ -1,5 +1,6 @@
using System.Net;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
namespace FileFlows.Web.FlowElements;
@@ -50,7 +51,7 @@ public abstract class HtmlParser : Node
var html = result.Value;
var list = ParseHtml(args.Logger, html);
var list = ParseHtml(args, html);
var pattern = args.ReplaceVariables(Pattern ?? string.Empty, stripMissing: true);
if (string.IsNullOrWhiteSpace(pattern) == false)
@@ -87,7 +88,8 @@ public abstract class HtmlParser : Node
args.Logger?.ILog("Found item: " + item);
}
args.Variables[VariableName] = list;
if(string.IsNullOrWhiteSpace(VariableName) == false)
args.Variables[VariableName] = list;
// current list is the default current list FileFlows will use in a list flow element if no list is specified
args.Variables["CurrentList"] = list;
@@ -97,10 +99,10 @@ public abstract class HtmlParser : Node
/// <summary>
/// Parses the HTML
/// </summary>
/// <param name="logger">the logger to use</param>
/// <param name="args">the node parameters</param>
/// <param name="html">the HTML to parse</param>
/// <returns>the items found while parsing</returns>
protected abstract List<string> ParseHtml(ILogger? logger, string html);
protected abstract List<string> ParseHtml(NodeParameters args, string html);
/// <summary>
/// Gets the file content
@@ -138,4 +140,60 @@ public abstract class HtmlParser : Node
return File.ReadAllText(localFileResult.Value);
}
/// <summary>
/// Parses the HTML for the specified tags and attributes
/// </summary>
/// <remarks>
/// Values starting with "http" are returned unchanged. Any other value is resolved against the
/// "Url" variable when that variable holds a valid absolute URL, and is skipped otherwise.
/// </remarks>
/// <param name="args">the node parameters</param>
/// <param name="html">the HTML to parse</param>
/// <param name="tags">the HTML tags to look for</param>
/// <param name="attributes">the attributes to look for</param>
/// <returns>a list of matching URLs</returns>
protected List<string> ParseHtmlForUrls(NodeParameters args, string html, string[] tags, string[] attributes)
{
    var htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(html);

    Uri? baseUri = null;
    if (args.Variables.TryGetValue("Url", out var oUrl) && oUrl is string sBaseUrl)
    {
        // TryCreate instead of the Uri constructor: a malformed "Url" variable must not abort parsing,
        // it only means relative URLs cannot be resolved
        if (Uri.TryCreate(sBaseUrl, UriKind.Absolute, out var parsedBase))
        {
            baseUri = parsedBase;
            args.Logger?.ILog("Base URL: " + baseUri);
        }
        else
        {
            args.Logger?.ILog("Base URL is not a valid absolute URL, relative URLs will be skipped: " + sBaseUrl);
        }
    }

    List<string> results = new();
    foreach (var tag in tags)
    {
        // SelectNodes returns null rather than an empty collection when nothing matches
        var nodes = htmlDoc.DocumentNode.SelectNodes($"//{tag}");
        if (nodes == null)
            continue;

        foreach (var ele in nodes)
        {
            foreach (var att in attributes)
            {
                var srcValue = ele.GetAttributeValue(att, string.Empty);
                if (string.IsNullOrEmpty(srcValue))
                    continue;

                if (srcValue.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                {
                    results.Add(srcValue);
                    continue;
                }

                // Without a base URL there is nothing to resolve a relative value against
                if (baseUri == null)
                    continue;

                if (Uri.TryCreate(srcValue, UriKind.Relative, out var relativeSrcUri)
                    && Uri.TryCreate(baseUri, relativeSrcUri, out var absoluteSrcUri))
                {
                    results.Add(absoluteSrcUri.ToString());
                }
            }
        }
    }

    return results;
}
}

View File

@@ -1,39 +0,0 @@
namespace FileFlows.Web.FlowElements;
/// <summary>
/// Flow element that builds a relative path string from a URL's path and query string
/// </summary>
/// <remarks>
/// NOTE(review): this class is unfinished and does not compile as written: Icon has no value,
/// Execute reads an undeclared identifier and never returns or publishes the computed path.
/// </remarks>
public class UrlToRelativePath : Node
{
/// <inheritdoc />
public override int Inputs => 1;
/// <inheritdoc />
public override int Outputs => 2;
// NOTE(review): the expression body for Icon is missing
public override string Icon =>
/// <summary>
/// Gets or sets the URL to get a path for
/// </summary>
[TextVariable(1)]
public string Url { get; set; } = null!;
/// <summary>
/// Computes a path of the form "path/query" from a URL, with the leading slash removed
/// </summary>
/// <param name="args">the node parameters</param>
/// <returns>NOTE(review): no value is returned yet; which of the two outputs is intended is not shown</returns>
public override int Execute(NodeParameters args)
{
// Create a Uri object from the URL
// NOTE(review): 'x' is not declared anywhere in this class; presumably the Url property was intended - confirm
var uri = new Uri(x);
// Get the path without the query
var path = uri.AbsolutePath;
// Get the query part, replacing '=' with '-' and '&' with '/'
var query = uri.Query.TrimStart('?').Replace('=', '-').Replace('&', '/');
// Combine the path and the modified query
var fakePath = path.TrimEnd('/') + (string.IsNullOrEmpty(query) ? string.Empty : "/" + query);
// Remove leading slash
if (fakePath.StartsWith("/"))
{
fakePath = fakePath.Substring(1);
}
// NOTE(review): fakePath is computed but never stored or returned
}
}

View File

@@ -1,4 +1,4 @@
namespace FileFlows.Web;
namespace FileFlows.Web.FlowElements;
using FileFlows.Plugin;
using FileFlows.Plugin.Attributes;
@@ -22,6 +22,8 @@ public class WebRequest : Node
public override string Icon => "fas fa-globe";
/// <inheritdoc />
public override string HelpUrl => "https://fileflows.com/docs/plugins/web/web-request";
/// <inheritdoc />
public override string Group => "Web";
/// <summary>
/// Gets or sets the URL