hatchet/frontend/docs/scripts/generate-llms.ts

/**
 * Generate llms.txt, llms-full.txt, per-page markdown files, and a MiniSearch
 * index (llms-search-index.json) from the Hatchet documentation.
 *
 * This script reads the MDX documentation pages, resolves Snippet references
 * to inline code, expands UniversalTabs into labeled language sections, and
 * converts JSX components to plain Markdown.
 *
 * Usage:
 *   tsx scripts/generate-llms.ts                          # all languages
 *   tsx scripts/generate-llms.ts --languages python       # Python only
 *   tsx scripts/generate-llms.ts --languages python,typescript
 */

import fs from "node:fs";
import path from "node:path";
import { fileURLToPath } from "node:url";
import { snippets } from "../lib/generated/snippets/index.js";

// ---------------------------------------------------------------------------
// Paths
// ---------------------------------------------------------------------------
// Directory containing this script. `fileURLToPath` is used instead of the raw
// `URL.pathname` because the latter keeps percent-escapes (a space becomes
// "%20") and yields "/C:/..." on Windows, neither of which is a usable path.
const SCRIPT_DIR = path.dirname(fileURLToPath(import.meta.url));
// The docs project root is the parent of the scripts directory.
const DOCS_ROOT = path.resolve(SCRIPT_DIR, "..");
// MDX sources (and their _meta.js files) are read from here.
const PAGES_DIR = path.join(DOCS_ROOT, "pages");
// All generated artifacts are written below this directory.
const OUTPUT_DIR = path.join(DOCS_ROOT, "public");

// Prefix used to build the absolute URL of every collected page.
const DOCS_BASE_URL = "https://docs.hatchet.run";

// Snippet language -> info string used on the generated code fence. The keys
// double as the list of values accepted by the --languages flag.
const LANGUAGE_EXTENSIONS: Record<string, string> = {
  python: "python",
  typescript: "typescript",
  go: "go",
};

// Lower-cased tab label -> language key, used to filter language tabs.
const TAB_LABEL_TO_LANG: Record<string, string> = {
  python: "python",
  typescript: "typescript",
  go: "go",
};

// ---------------------------------------------------------------------------
// Snippet resolution
// ---------------------------------------------------------------------------
type SnippetNode = Record<string, any>;

/**
 * Look up a snippet in the snippet tree by its dotted path.
 *
 * A leading "snippets." prefix is ignored. Every path segment must name a
 * property of the object reached so far, and the final node must be an object
 * that has a `content` property.
 *
 * @param tree    Root of the snippet tree.
 * @param dotpath Dotted path such as "snippets.python.simple.worker".
 * @returns The snippet node, or null when the path does not lead to one.
 */
function resolveSnippetPath(
  tree: SnippetNode,
  dotpath: string,
): SnippetNode | null {
  const prefix = "snippets.";
  const relative = dotpath.startsWith(prefix)
    ? dotpath.slice(prefix.length)
    : dotpath;

  let node: any = tree;
  for (const segment of relative.split(".")) {
    const canDescend =
      Boolean(node) && typeof node === "object" && segment in node;
    if (!canDescend) return null;
    node = node[segment];
  }

  const isSnippet =
    Boolean(node) && typeof node === "object" && "content" in node;
  return isSnippet ? (node as SnippetNode) : null;
}

// ---------------------------------------------------------------------------
// _meta.js parsing
// ---------------------------------------------------------------------------
/** One documentation page discovered through the _meta.js files. */
interface DocPage {
  // Display title from _meta.js; collectors fall back to the key when empty.
  title: string;
  // The page's key in its _meta.js file.
  slug: string;
  // Absolute URL built from DOCS_BASE_URL, the URL prefix and the key.
  href: string;
  // Path of the .mdx file the page content is read from.
  filepath: string;
  // Title of the top-level section the page is listed under.
  section: string;
}

/**
 * Read a _meta.js file and return its default export as a plain object.
 *
 * The file is not evaluated. Its text is rewritten into JSON with a handful
 * of regex passes and handed to JSON.parse, so only simple object literals
 * are supported:
 *   - plain unquoted or quoted keys (no computed `[expr]` keys)
 *   - string or plain-object values (no calls, template literals, spreads or
 *     variable references)
 *   - no inline or block comments
 *
 * Files that need more than this must be simplified, or this function has to
 * be replaced by a real JavaScript parser.
 *
 * @param filepath Path of the _meta.js file to read.
 * @returns The parsed object.
 * @throws Error when the rewritten text is not valid JSON; the message
 *   includes the rewritten text to help debugging.
 */
function parseMetaJs(filepath: string): Record<string, any> {
  // Unquoted key at the start of a line.
  const lineKey = /^(\s*)([a-zA-Z_$][a-zA-Z0-9_$-]*)\s*:/gm;
  // Unquoted key directly after an opening brace, e.g. `{ collapsed: true }`.
  const inlineKey = /(\{\s*)([a-zA-Z_$][a-zA-Z0-9_$-]*)\s*:/g;
  const quotedKey = '$1"$2":';

  const jsonText = fs
    .readFileSync(filepath, "utf-8")
    .replace("export default ", "")
    // The line-key pass is deliberately applied twice, as before.
    .replace(lineKey, quotedKey)
    .replace(lineKey, quotedKey)
    .replace(inlineKey, quotedKey)
    // Drop trailing commas in front of a closing brace.
    .replace(/,(\s*\n?\s*})(\s*);?/g, "$1")
    // Drop the semicolon that ends `export default {...};`.
    .replace(/\s*;\s*$/, "");

  try {
    return JSON.parse(jsonText);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    const explanation = [
      `Failed to parse _meta.js at ${filepath}: ${reason}.\n`,
      `The regex-based parser only supports simple object literals `,
      `(no computed keys, spread operators, comments, or expressions). `,
      `Simplify the file or switch to a proper JS parser.\n`,
      `--- transformed content ---\n${jsonText}`,
    ];
    throw new Error(explanation.join(""));
  }
}

/**
 * Decide whether a _meta.js entry describes a page that should be listed.
 *
 * Keys starting with "--" or "_" (after trimming) are rejected. A string
 * value is accepted; an object value is accepted when it has a `title` and
 * its `display` is not "hidden". Everything else is rejected.
 */
function isDocPage(key: string, value: any): boolean {
  const name = key.trim();
  if (name.startsWith("--") || name.startsWith("_")) return false;

  switch (typeof value) {
    case "string":
      return true;
    case "object":
      return value !== null && value.display !== "hidden" && "title" in value;
    default:
      return false;
  }
}

/**
 * Return the title of a _meta.js entry: the value itself when it is a string,
 * its `title` property when it is an object that has one, "" otherwise.
 */
function extractTitle(value: any): string {
  if (typeof value === "string") return value;
  const hasTitle =
    typeof value === "object" && value !== null && "title" in value;
  return hasTitle ? value.title : "";
}

/**
 * Append the pages listed in `dir/_meta.js` to `pages`, descending into
 * sub-folders that have their own _meta.js. Does nothing when `dir` has no
 * _meta.js.
 *
 * @param dir          Directory whose _meta.js is read.
 * @param urlPrefix    URL path (without leading slash) that maps to `dir`.
 * @param sectionTitle Section title stamped on every collected page.
 * @param pages        Output array; mutated in place.
 */
function collectPagesFromDir(
  dir: string,
  urlPrefix: string,
  sectionTitle: string,
  pages: DocPage[],
): void {
  const metaPath = path.join(dir, "_meta.js");
  if (!fs.existsSync(metaPath)) return;

  const entries = Object.entries(parseMetaJs(metaPath)).filter(
    ([key, value]) => isDocPage(key, value),
  );

  for (const [key, value] of entries) {
    const childDir = path.join(dir, key);
    const childPrefix = `${urlPrefix}/${key}`;
    const pageFor = (filepath: string): DocPage => ({
      title: extractTitle(value as any) || key,
      slug: key,
      href: `${DOCS_BASE_URL}/${childPrefix}`,
      filepath,
      section: sectionTitle,
    });

    // A folder with its own _meta.js is a sub-section.
    const childMetaPath = path.join(childDir, "_meta.js");
    if (fs.existsSync(childMetaPath)) {
      const folderIndex = path.join(childDir, "index.mdx");
      if (fs.existsSync(folderIndex)) {
        // The folder's index page is listed when the sub-section's _meta.js
        // has no "index" entry, or has an object entry that is not hidden.
        // NOTE(review): a string-valued "index" entry is skipped here —
        // confirm that this is intended.
        const indexEntry = parseMetaJs(childMetaPath)["index"];
        const listed =
          !indexEntry ||
          (typeof indexEntry === "object" && indexEntry.display !== "hidden");
        if (listed) pages.push(pageFor(folderIndex));
      }
      collectPagesFromDir(childDir, childPrefix, sectionTitle, pages);
      continue;
    }

    // Otherwise the entry is `<key>.mdx`, or failing that `<key>/index.mdx`.
    const candidates = [
      path.join(dir, key + ".mdx"),
      path.join(childDir, "index.mdx"),
    ];
    const mdxPath = candidates.find((candidate) => fs.existsSync(candidate));
    if (mdxPath === undefined) continue;

    pages.push(pageFor(mdxPath));
  }
}

/**
 * Collect every documentation page, section by section, in the order given by
 * the root `pages/_meta.js`.
 *
 * Root keys starting with "_" are ignored. A root entry whose directory has a
 * _meta.js is walked with collectPagesFromDir; otherwise it is treated as a
 * single top-level `<key>.mdx` page (and skipped if that file is missing).
 *
 * The section title is the entry's title, whether the entry is written as a
 * plain string (`home: "Home"`) or as an object with a `title`. When no title
 * is available the key is used, so a section never ends up with an empty
 * heading.
 *
 * @returns The collected pages; empty when the root _meta.js does not exist.
 */
function collectPages(): DocPage[] {
  const pages: DocPage[] = [];

  const rootMetaPath = path.join(PAGES_DIR, "_meta.js");
  if (!fs.existsSync(rootMetaPath)) return pages;

  const rootMeta = parseMetaJs(rootMetaPath);
  const sectionOrder = Object.keys(rootMeta).filter(
    (k) => !k.startsWith("_"),
  );

  for (const sectionKey of sectionOrder) {
    const sectionDir = path.join(PAGES_DIR, sectionKey);
    const sectionMetaPath = path.join(sectionDir, "_meta.js");

    // extractTitle understands both the string and the object form and
    // returns "" for anything else, in which case the key is used.
    const sectionTitle = extractTitle(rootMeta[sectionKey]) || sectionKey;

    if (!fs.existsSync(sectionMetaPath)) {
      // Plain top-level .mdx file
      const mdxPath = path.join(PAGES_DIR, sectionKey + ".mdx");
      if (fs.existsSync(mdxPath)) {
        pages.push({
          title: sectionTitle,
          slug: sectionKey,
          href: `${DOCS_BASE_URL}/${sectionKey}`,
          filepath: mdxPath,
          section: sectionTitle,
        });
      }
      continue;
    }

    // Recurse into section directory
    collectPagesFromDir(sectionDir, sectionKey, sectionTitle, pages);
  }

  return pages;
}

// ---------------------------------------------------------------------------
// MDX -> Markdown conversion
// ---------------------------------------------------------------------------
/**
 * Remove the block of import statements (and blank lines) at the top of an
 * MDX document. Everything from the first other line onwards is returned
 * untouched, including any later line that happens to start with "import ".
 *
 * Imports that are wrapped over several lines,
 *
 *     import {
 *       A,
 *       B,
 *     } from "x";
 *
 * are removed as a whole. Such a statement is only recognised when the
 * opening line contains "{" without "}", the following lines consist of
 * identifiers, commas and whitespace, and the statement is closed by a
 * `} from "..."` line. Anything else is treated as a single-line import, so
 * the function never removes more than that one line for input it does not
 * understand.
 */
function stripImportLines(text: string): string {
  const lines = text.split("\n");
  const specifierLine = /^[\w$\s,]*$/;
  const closingLine = /^\}\s*from\s+["'][^"']*["']\s*;?$/;

  let index = 0;
  while (index < lines.length) {
    const stripped = lines[index].trim();
    if (stripped === "") {
      index++;
      continue;
    }
    if (!stripped.startsWith("import ")) break;

    // Last line of the current import statement; the line itself unless a
    // well-formed multi-line import is found.
    let end = index;
    if (stripped.includes("{") && !stripped.includes("}")) {
      let probe = index + 1;
      while (probe < lines.length && specifierLine.test(lines[probe].trim())) {
        probe++;
      }
      if (probe < lines.length && closingLine.test(lines[probe].trim())) {
        end = probe;
      }
    }
    index = end + 1;
  }

  return lines.slice(index).join("\n");
}

/** Remove every JSX comment of the form `{/* ... *\/}`, including multi-line ones. */
function stripJsxComments(text: string): string {
  const jsxComment = /\{\/\*[\s\S]*?\*\/\}/g;
  // Splitting on the comment and re-joining drops each occurrence.
  return text.split(jsxComment).join("");
}

/**
 * Replace every `<Snippet src={...} />` tag with a fenced code block holding
 * the referenced snippet.
 *
 * - An unknown path becomes an HTML comment naming the path.
 * - When `languages` is given and the snippet's language is not in it, the tag
 *   is replaced by an empty string.
 *
 * @param text        MDX text to process.
 * @param snippetTree Tree the snippet paths are resolved against.
 * @param languages   Languages to keep, or null to keep all.
 */
function resolveSnippets(
  text: string,
  snippetTree: SnippetNode,
  languages: string[] | null,
): string {
  const snippetTag = /<Snippet\s+src=\{([\s\S]*?)\}\s*\/>/g;

  const render = (_tag: string, rawPath: string): string => {
    // The expression may be wrapped over several lines; drop all whitespace.
    const dotpath = rawPath.replace(/\s+/g, "").trim();
    const snippet = resolveSnippetPath(snippetTree, dotpath);
    if (!snippet) return `<!-- snippet not found: ${dotpath} -->`;

    const lang = snippet.language ?? "";
    const filteredOut = languages && !languages.includes(lang);
    if (filteredOut) return "";

    const fenceInfo = LANGUAGE_EXTENSIONS[lang] ?? lang;
    const code = (snippet.content ?? "").trimEnd();
    return ["```" + fenceInfo, code, "```"].join("\n");
  };

  return text.replace(snippetTag, render);
}

/**
 * Turn `<Callout type="...">...</Callout>` blocks into Markdown blockquotes.
 *
 * The first content line is prefixed with the capitalised type in bold
 * (`> **Info:** ...`); every further line is quoted with "> ", blank lines
 * becoming a bare ">". Callouts without a `type` attribute are left alone.
 */
function convertCallouts(text: string): string {
  const calloutBlock =
    /<Callout\s+type=["'](\w+)["']\s*>([\s\S]*?)<\/Callout>/g;

  return text.replace(calloutBlock, (_whole, kind: string, body: string) => {
    const label = `${kind.charAt(0).toUpperCase()}${kind.slice(1)}`;
    const [first, ...rest] = body.trim().split("\n");
    const quoted = [
      `> **${label}:** ${first}`,
      ...rest.map((line) => (line.trim() ? `> ${line}` : ">")),
    ];
    return quoted.join("\n");
  });
}

// ---------------------------------------------------------------------------
// Tab expansion
// ---------------------------------------------------------------------------
/**
 * Remove the common leading indentation from the content of a tab.
 *
 * Lines inside fenced code blocks are never touched; the fence lines
 * themselves and everything outside fences are candidates. The indentation to
 * remove is the smallest non-zero indent among candidate lines that are not
 * blank and do not start with "<" or "{/*". Candidate lines that start with
 * at least that much whitespace lose it; all other lines stay as they are.
 */
function dedentTabContent(text: string): string {
  const rows = text.split("\n");

  // Mark the rows that may be dedented (fence markers included). Plain arrays
  // are used throughout so the code does not depend on iterating a Set.
  let insideFence = false;
  const dedentable = rows.map((row) => {
    if (row.trimStart().startsWith("```")) {
      insideFence = !insideFence;
      return true;
    }
    return !insideFence;
  });

  const indentOf = (row: string): number =>
    row.length - row.trimStart().length;

  const indents = rows
    .filter((row, i) => {
      if (!dedentable[i]) return false;
      const body = row.trim();
      return body !== "" && !body.startsWith("<") && !body.startsWith("{/*");
    })
    .map(indentOf)
    .filter((width) => width !== 0);

  if (indents.length === 0) return text;
  const width = Math.min(...indents);

  return rows
    .map((row, i) => {
      const removable =
        dedentable[i] &&
        row.length >= width &&
        row.slice(0, width).trim() === "";
      return removable ? row.slice(width) : row;
    })
    .join("\n");
}

/**
 * Split the inside of a tabs container into its top-level `<Tabs.Tab>` bodies.
 *
 * Each tab found is paired, in order, with the label at the same position in
 * `items`; when there are more tabs than labels the label is "Tab N"
 * (1-based). Tab bodies are passed through dedentTabContent.
 *
 * @param inner Text between the opening and closing tags of the container.
 * @param items Tab labels, in display order.
 * @returns `[label, content]` pairs in document order.
 */
function extractTabContents(
  inner: string,
  items: string[],
): [string, string][] {
  const result: [string, string][] = [];
  let tabIdx = 0;
  let pos = 0;

  while (pos < inner.length) {
    // Find the next opening tag at or after `pos`.
    const openMatch = inner.slice(pos).match(/<Tabs\.Tab(?:\s+[^>]*)?>/);
    if (!openMatch || openMatch.index === undefined) break;

    // `start` is the first character of the tab body.
    const start = pos + openMatch.index + openMatch[0].length;
    let depth = 1;
    let scan = start;

    // Walk forward over opening/closing tags until the one that balances the
    // opening tag found above.
    while (scan < inner.length && depth > 0) {
      const remaining = inner.slice(scan);
      const nextOpen = remaining.match(/<Tabs\.Tab(?:\s+[^>]*)?>/);
      const nextClose = remaining.match(/<\/Tabs\.Tab>/);

      // No closing tag left: the tab is unterminated and is dropped.
      if (!nextClose || nextClose.index === undefined) break;

      if (
        nextOpen &&
        nextOpen.index !== undefined &&
        nextOpen.index < nextClose.index
      ) {
        // A nested tab opens before the next close.
        depth++;
        scan += nextOpen.index + nextOpen[0].length;
      } else {
        depth--;
        if (depth === 0) {
          // Balanced: everything from `start` up to this closing tag is the
          // body of the current top-level tab.
          let content = inner.slice(start, scan + nextClose.index);
          content = dedentTabContent(content);
          const label =
            tabIdx < items.length ? items[tabIdx] : `Tab ${tabIdx + 1}`;
          result.push([label, content]);
          tabIdx++;
          scan += nextClose.index + nextClose[0].length;
        } else {
          // Closing tag of a nested tab; keep scanning.
          scan += nextClose.index + nextClose[0].length;
        }
      }
    }

    // Continue the search after whatever was consumed for this tab.
    pos = scan;
  }

  return result;
}

/**
 * Replace `<UniversalTabs items={[...]}>` blocks with one "#### label" section
 * per tab.
 *
 * For language tabs (no `optionKey`, or `optionKey="language"`) a tab is
 * dropped when `languages` is given, the tab label maps to a known language
 * and that language is not in `languages`. Nested blocks are handled by
 * expanding innermost blocks first and repeating until nothing changes.
 *
 * @param text      MDX text to process.
 * @param languages Languages to keep, or null to keep all.
 */
function expandUniversalTabs(
  text: string,
  languages: string[] | null,
): string {
  // The body may not contain another opening tag, so only innermost blocks
  // match on any given pass.
  const universalTabsBlock =
    /<UniversalTabs\s+items=\{(\[[^\]]*\])\}(?:\s+optionKey=["']([^"']*)["'])?(?:\s+variant=["'][^"']*["'])?\s*>((?:(?!<UniversalTabs)[\s\S])*?)<\/UniversalTabs>/g;

  const unquote = (quoted: RegExpMatchArray | null): string[] =>
    (quoted ?? []).map((s) => s.slice(1, -1));

  const renderBlock = (
    _whole: string,
    itemsStr: string,
    optionKey: string | undefined,
    inner: string,
  ): string => {
    // Labels are double-quoted strings, or single-quoted ones if there are no
    // double-quoted ones.
    const doubleQuoted = unquote(itemsStr.match(/"([^"]*)"/g));
    const items =
      doubleQuoted.length > 0
        ? doubleQuoted
        : unquote(itemsStr.match(/'([^']*)'/g));

    const isLanguageTabs = !optionKey || optionKey === "language";

    return extractTabContents(inner, items)
      .filter(([label]) => {
        const langKey = TAB_LABEL_TO_LANG[label.toLowerCase()];
        const excluded =
          isLanguageTabs &&
          langKey &&
          languages &&
          !languages.includes(langKey);
        return !excluded;
      })
      .map(([label, content]) => `#### ${label}\n\n${content.trim()}`)
      .join("\n\n");
  };

  let current = text;
  for (;;) {
    const next = current.replace(universalTabsBlock, renderBlock);
    if (next === current) return next;
    current = next;
  }
}

/**
 * Replace `<Tabs items={[...]}>...</Tabs>` blocks with one "#### label"
 * section per tab, in order. No language filtering is applied.
 */
function expandStandaloneTabs(text: string): string {
  const tabsBlock =
    /<Tabs\s+items=\{(\[[\s\S]*?\])\}\s*>([\s\S]*?)<\/Tabs>/g;

  const unquote = (quoted: RegExpMatchArray | null): string[] =>
    (quoted ?? []).map((s) => s.slice(1, -1));

  return text.replace(tabsBlock, (_whole, itemsStr: string, inner: string) => {
    // Labels are double-quoted strings, or single-quoted ones if there are no
    // double-quoted ones.
    const doubleQuoted = unquote(itemsStr.match(/"([^"]*)"/g));
    const items =
      doubleQuoted.length > 0
        ? doubleQuoted
        : unquote(itemsStr.match(/'([^']*)'/g));

    return extractTabContents(inner, items)
      .map(([label, content]) => `#### ${label}\n\n${content.trim()}`)
      .join("\n\n");
  });
}

// ---------------------------------------------------------------------------
// Other component converters
// ---------------------------------------------------------------------------
/** Drop `<Steps>`, `<Steps />` and `</Steps>` tags, keeping what they enclose. */
function convertSteps(text: string): string {
  return text.replace(/<Steps\s*\/?>/g, "").replace(/<\/Steps>/g, "");
}

/**
 * Convert card components into a Markdown list.
 *
 * `<Cards>` wrappers are dropped. Each `<Card ...>` (self-closing or with
 * content) becomes one list item: a link when it has an `href`, a bold title
 * otherwise, followed by ": content" when the card has non-blank content.
 */
function convertCards(text: string): string {
  const withoutWrappers = text
    .replace(/<Cards\s*\/?>/g, "")
    .replace(/<\/Cards>/g, "");

  const cardTag = /<Card\s+([\s\S]*?)(?:>([\s\S]*?)<\/Card>|\/>)/g;

  return withoutWrappers.replace(
    cardTag,
    (_whole, attrs: string, content?: string) => {
      const title = /title=["']([^"']*)["']/.exec(attrs)?.[1] ?? "";
      const href = /href=["']([^"']*)["']/.exec(attrs)?.[1] ?? "";
      const body = content?.trim() ?? "";

      const heading = href ? `[${title}](${href})` : `**${title}**`;
      const suffix = body ? ": " + body : "";
      return `- ${heading}${suffix}`;
    },
  );
}

/**
 * Replace each `<FileTree>...</FileTree>` block with a plain fenced code block
 * that lists the tree, one entry per line, indented two spaces per nesting
 * level. Folder names get a trailing "/".
 *
 * The tags inside the block are read in a single pass in document order while
 * tracking the current folder depth. This lists every file exactly once under
 * its own folder, keeps files and folders in source order, and supports
 * folders nested to any depth.
 */
function convertFileTree(text: string): string {
  /** Render the entries found in the inside of one FileTree block. */
  function renderTree(inner: string): string[] {
    // One alternative per tag kind: folder open (group 1 = name), folder
    // close, file (group 2 = name).
    const tagPattern =
      /<FileTree\.Folder\s+name=["']([^"']*)["'][^>]*>|<\/FileTree\.Folder>|<FileTree\.File\s+name=["']([^"']*)["'][^>]*\s*\/>/g;

    const lines: string[] = [];
    let depth = 0;
    let tag: RegExpExecArray | null;
    while ((tag = tagPattern.exec(inner)) !== null) {
      const [whole, folderName, fileName] = tag;
      if (folderName !== undefined) {
        lines.push("  ".repeat(depth) + folderName + "/");
        // A self-closing folder is empty and has no closing tag to wait for.
        if (!whole.endsWith("/>")) depth++;
      } else if (fileName !== undefined) {
        lines.push("  ".repeat(depth) + fileName);
      } else {
        // Closing folder tag; never go below the root on unbalanced input.
        depth = Math.max(0, depth - 1);
      }
    }
    return lines;
  }

  return text.replace(
    /<FileTree>([\s\S]*?)<\/FileTree>/g,
    (_match, inner: string) =>
      "```\n" + renderTree(inner).join("\n") + "\n```",
  );
}

/**
 * Remove remaining JSX component tags (names starting with an upper-case
 * letter, optionally dotted), keeping whatever text they enclose.
 *
 * Fenced code blocks are left untouched: their content is literal code, and
 * stripping there would delete things that merely look like tags, such as the
 * type arguments in `hatchet.task<Input>(...)`. A fence opens on a line
 * starting with three or more backticks and closes on a later line starting
 * with at least as many. Text outside fences is processed in contiguous
 * chunks so that tags wrapped over several lines are still removed.
 */
function stripJsxComponents(text: string): string {
  const stripTags = (prose: string): string =>
    prose
      // Self-closing JSX tags
      .replace(/<[A-Z]\w*(?:\.\w+)*\s*[^>]*\/\s*>/g, "")
      // Opening/closing JSX tags
      .replace(/<\/?[A-Z]\w*(?:\.\w+)*\s*[^>]*>/g, "");

  const output: string[] = [];
  let prose: string[] = [];
  let fenceMarker: string | null = null;

  const flushProse = (): void => {
    if (prose.length === 0) return;
    output.push(stripTags(prose.join("\n")));
    prose = [];
  };

  for (const line of text.split("\n")) {
    const fence = line.trimStart().match(/^(`{3,})/);
    if (fenceMarker === null) {
      if (fence) {
        flushProse();
        fenceMarker = fence[1];
        output.push(line);
      } else {
        prose.push(line);
      }
    } else {
      output.push(line);
      if (fence && fence[1].length >= fenceMarker.length) fenceMarker = null;
    }
  }
  flushProse();

  return output.join("\n");
}

/**
 * Inline MDX files that a page imports as components.
 *
 * For every `import Name from "./file.mdx"` found in `text`, each self-closing
 * `<Name />` usage is replaced by the imported file converted to Markdown
 * (via convertMdxToMarkdown, so imports inside it are resolved as well). When
 * the imported file does not exist the usage is replaced by an HTML comment
 * naming the path. The import lines themselves are left in place.
 *
 * Replacements are made with replacer functions so the inserted text is used
 * literally; with a plain replacement string, sequences such as `$&`, `$1` or
 * `$$` in the imported content would be interpreted by `String.replace`.
 *
 * @param text        MDX text of the page.
 * @param filepath    Path of the page; relative imports resolve against it.
 * @param snippetTree Snippet tree passed on to the conversion.
 * @param languages   Languages to keep, or null to keep all.
 * @param depth       Current nesting level. Beyond 10 a warning is logged and
 *                    the text is returned unchanged.
 */
function resolveMdxComponentImports(
  text: string,
  filepath: string,
  snippetTree: SnippetNode,
  languages: string[] | null,
  depth: number = 0,
): string {
  if (depth > 10) {
    console.warn(
      `[generate-llms] resolveMdxComponentImports: recursion depth limit ` +
        `(10) reached while processing "${filepath}". This likely indicates ` +
        `circular MDX imports. The remaining component references will not ` +
        `be resolved.`,
    );
    return text;
  }

  const mdxImportPattern =
    /import\s+(\w+)\s+from\s+["']([^"']*\.mdx)["']/g;

  // Collect all MDX component imports first, before the text is modified.
  const imports: Array<{ componentName: string; relPath: string }> = [];
  let importMatch: RegExpExecArray | null;
  while ((importMatch = mdxImportPattern.exec(text)) !== null) {
    imports.push({
      componentName: importMatch[1],
      relPath: importMatch[2],
    });
  }

  for (const imp of imports) {
    // componentName matched \w+, so it is safe to embed in a pattern.
    const usagePattern = new RegExp(`<${imp.componentName}\\s*/\\s*>`, "g");
    const importedFilePath = path.resolve(path.dirname(filepath), imp.relPath);

    if (!fs.existsSync(importedFilePath)) {
      // Fall back to a comment if the file can't be found
      const notice = `<!-- Could not resolve ${imp.relPath} -->`;
      text = text.replace(usagePattern, () => notice);
      continue;
    }

    // Read the imported MDX and recursively convert it
    const importedRaw = fs.readFileSync(importedFilePath, "utf-8");
    const inlined = convertMdxToMarkdown(
      importedRaw,
      snippetTree,
      languages,
      importedFilePath,
      depth + 1,
    ).trim();

    // Replace all usages of <ComponentName /> with the inlined content
    text = text.replace(usagePattern, () => inlined);
  }

  return text;
}

/** Collapse any run of four or more newlines down to exactly three. */
function cleanBlankLines(text: string): string {
  const excessNewlines = /\n{4,}/g;
  return text.replace(excessNewlines, "\n".repeat(3));
}

// ---------------------------------------------------------------------------
// Full pipeline
// ---------------------------------------------------------------------------
/**
 * Convert the source of one MDX page to plain Markdown.
 *
 * When `filepath` is given, imported MDX components are inlined first. The
 * text then runs through the conversion stages in a fixed order, and the
 * result is trimmed and terminated with a single newline.
 *
 * @param content     Raw MDX source.
 * @param snippetTree Tree used to resolve `<Snippet>` references.
 * @param languages   Languages to keep, or null to keep all.
 * @param filepath    Path of the MDX file; enables inlining of MDX imports.
 * @param depth       Nesting level of MDX imports (defaults to 0).
 */
function convertMdxToMarkdown(
  content: string,
  snippetTree: SnippetNode,
  languages: string[] | null,
  filepath?: string,
  depth?: number,
): string {
  const inlined = filepath
    ? resolveMdxComponentImports(
        content,
        filepath,
        snippetTree,
        languages,
        depth ?? 0,
      )
    : content;

  // Order matters: each stage sees the output of the previous one.
  const stages: Array<(text: string) => string> = [
    (text) => stripImportLines(text),
    (text) => stripJsxComments(text),
    (text) => convertCallouts(text),
    (text) => resolveSnippets(text, snippetTree, languages),
    (text) => expandUniversalTabs(text, languages),
    (text) => expandStandaloneTabs(text),
    (text) => convertSteps(text),
    (text) => convertCards(text),
    (text) => convertFileTree(text),
    (text) => stripJsxComponents(text),
    (text) => cleanBlankLines(text),
  ];

  const markdown = stages.reduce((text, stage) => stage(text), inlined);
  return markdown.trim() + "\n";
}

// ---------------------------------------------------------------------------
// MiniSearch index generation
// ---------------------------------------------------------------------------
import MiniSearch from "minisearch";

import { MINISEARCH_OPTIONS } from "../lib/search-config.js";

/** One entry of the search index: a page, or one h2 section of a page. */
interface SearchDoc {
  // Page route, plus "#<slug>" for an h2 section; a numeric suffix is appended
  // when the same id would otherwise occur twice.
  id: string;
  // Section heading, or the page title for content before the first h2.
  title: string;
  // Markdown content of the section.
  content: string;
  // Space-separated identifiers taken from the section's fenced code blocks.
  codeIdentifiers: string;
  // Keywords from the page's <Keywords> component ("" if it has none).
  keywords: string;
  // Title of the page the section belongs to.
  pageTitle: string;
  // Route of the page in the form "hatchet://docs/<url path>".
  pageRoute: string;
}

/**
 * Return the value of the `keywords` attribute of the first self-closing
 * `<Keywords keywords="..." />` component in the raw MDX source, or "" when
 * the page has none.
 */
function extractKeywords(rawMdx: string): string {
  const keywordsTag = /<Keywords\s+keywords=["']([^"']*)["']\s*\/>/;
  return keywordsTag.exec(rawMdx)?.[1] ?? "";
}

/**
 * Collect code identifiers from the fenced code blocks of a Markdown text.
 *
 * Two kinds are collected, lower-cased and de-duplicated: dotted identifiers
 * (e.g. hatchet.task, ctx.spawn) and names that follow an "@" (e.g. the
 * "hatchet.task" of @hatchet.task). Text outside fences is ignored.
 *
 * @returns The identifiers in order of first appearance, separated by spaces.
 */
function extractCodeIdentifiers(markdown: string): string {
  const found = new Set<string>();
  // Backtick run that opened the current fence; null outside fences.
  let openFence: string | null = null;

  for (const line of markdown.split("\n")) {
    const fence = /^(`{3,})/.exec(line.trimStart());
    if (fence) {
      if (openFence === null) {
        openFence = fence[1];
      } else if (fence[1].length >= openFence.length) {
        openFence = null;
      }
      // Lines starting with a backtick run are never scanned themselves.
      continue;
    }

    if (openFence === null) continue;

    // Dotted identifiers: hatchet.task, ctx.spawn, hatchet.workflow, etc.
    const dotted = line.match(/[a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)+/g) ?? [];
    for (const name of dotted) {
      found.add(name.toLowerCase());
    }

    // Decorated identifiers: @hatchet.task, @hatchet.workflow. The pattern
    // starts with "@", so dropping the first character leaves the name.
    const decorated = line.match(/@([a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)*)/g) ?? [];
    for (const name of decorated) {
      found.add(name.slice(1).toLowerCase());
    }
  }

  return Array.from(found).join(" ");
}

/**
 * Turn heading text into a URL anchor slug (intended to match the anchors
 * Nextra generates): lower-case, drop everything except word characters,
 * whitespace and "-", turn whitespace runs into "-", collapse repeated "-",
 * and remove a leading or trailing "-".
 */
function slugify(text: string): string {
  const steps: Array<[RegExp, string]> = [
    [/[^\w\s-]/g, ""],
    [/\s+/g, "-"],
    [/-+/g, "-"],
    [/^-|-$/g, ""],
  ];
  return steps.reduce(
    (slug, [pattern, replacement]) => slug.replace(pattern, replacement),
    text.toLowerCase(),
  );
}

/**
 * Split Markdown into sections at its h2 ("## ") headings.
 *
 * Each section carries its heading text, the slug of that heading and the
 * trimmed content below it. Content before the first h2 forms a section with
 * an empty heading and slug; it is omitted when it is blank. Lines that look
 * like headings inside fenced code blocks do not start a new section.
 */
function splitByH2(
  markdown: string,
): Array<{ heading: string; slug: string; content: string }> {
  const sections: Array<{ heading: string; slug: string; content: string }> =
    [];
  let heading = "";
  let slug = "";
  let buffer: string[] = [];
  // Backtick run that opened the current fence; null outside fences.
  let openFence: string | null = null;

  // Store the section collected so far, unless it has neither heading nor text.
  const flush = (): void => {
    const content = buffer.join("\n").trim();
    if (content || heading) {
      sections.push({ heading, slug, content });
    }
  };

  for (const line of markdown.split("\n")) {
    // A fence opens with 3+ backticks and closes only on a run of at least
    // the same length (CommonMark); shorter runs inside are just content.
    const fence = /^(`{3,})/.exec(line.trimStart());
    if (fence) {
      if (openFence === null) {
        openFence = fence[1];
      } else if (fence[1].length >= openFence.length) {
        openFence = null;
      }
    }

    const h2 = openFence === null ? /^## (.+)$/.exec(line) : null;
    if (h2 === null) {
      buffer.push(line);
      continue;
    }

    flush();
    heading = h2[1].trim();
    slug = slugify(heading);
    buffer = [];
  }

  flush();
  return sections;
}

/**
 * Build the MiniSearch index over all pages and return it serialised as JSON.
 *
 * Every page is converted to Markdown and split at its h2 headings; each
 * section with non-blank content becomes one search document.
 *
 * @param pages       Pages to index.
 * @param snippetTree Snippet tree used during conversion.
 * @param languages   Languages to keep, or null to keep all.
 */
function buildSearchIndex(
  pages: DocPage[],
  snippetTree: SnippetNode,
  languages: string[] | null,
): string {
  const index = new MiniSearch<SearchDoc>(MINISEARCH_OPTIONS);

  // Ids must be unique. A repeated id gets the first free "-2", "-3", ...
  // suffix.
  const usedIds = new Set<string>();
  const claimId = (base: string): string => {
    let id = base;
    if (usedIds.has(id)) {
      let suffix = 2;
      while (usedIds.has(`${base}-${suffix}`)) suffix++;
      id = `${base}-${suffix}`;
    }
    usedIds.add(id);
    return id;
  };

  const docs: SearchDoc[] = [];
  for (const page of pages) {
    const raw = fs.readFileSync(page.filepath, "utf-8");
    const markdown = convertMdxToMarkdown(
      raw,
      snippetTree,
      languages,
      page.filepath,
    );
    const urlPath = page.href.replace(DOCS_BASE_URL + "/", "");
    const pageRoute = `hatchet://docs/${urlPath}`;
    const keywords = extractKeywords(raw);

    for (const section of splitByH2(markdown)) {
      if (!section.content.trim()) continue;

      const baseId = section.slug
        ? `${pageRoute}#${section.slug}`
        : pageRoute;

      docs.push({
        id: claimId(baseId),
        title: section.heading || page.title,
        content: section.content,
        codeIdentifiers: extractCodeIdentifiers(section.content),
        keywords,
        pageTitle: page.title,
        pageRoute,
      });
    }
  }

  index.addAll(docs);
  return JSON.stringify(index);
}

// ---------------------------------------------------------------------------
// Output generation
// ---------------------------------------------------------------------------
/**
 * Render llms.txt: a title, a one-paragraph summary, and the pages as link
 * lists grouped under a "## section" heading. A heading is emitted whenever
 * the section differs from that of the previous page.
 */
function generateLlmsTxt(pages: DocPage[]): string {
  const summary =
    "> Hatchet is a distributed task queue and workflow engine for modern " +
    "applications. It provides durable execution, concurrency control, " +
    "rate limiting, and observability for background tasks and workflows " +
    "in Python, TypeScript, and Go.";

  const out: string[] = ["# Hatchet Documentation", "", summary, ""];

  let previousSection = "";
  pages.forEach((page) => {
    if (page.section !== previousSection) {
      previousSection = page.section;
      out.push(`## ${previousSection}`, "");
    }
    out.push(`- [${page.title}](${page.href})`);
  });

  // Trailing empty entry so the file ends with a newline.
  out.push("");
  return out.join("\n");
}

/**
 * Render llms-full.txt: the title and summary followed by the full Markdown
 * of every page, each preceded by a "---" rule and an HTML comment with the
 * page's URL.
 *
 * @param pages       Pages to include, in output order.
 * @param snippetTree Snippet tree used during conversion.
 * @param languages   Languages to keep, or null to keep all.
 */
function generateLlmsFullTxt(
  pages: DocPage[],
  snippetTree: SnippetNode,
  languages: string[] | null,
): string {
  const header: string[] = [
    "# Hatchet Documentation",
    "",
    "> Hatchet is a distributed task queue and workflow engine for modern " +
      "applications. It provides durable execution, concurrency control, " +
      "rate limiting, and observability for background tasks and workflows " +
      "in Python, TypeScript, and Go.",
    "",
  ];

  const body = pages.map((page) => {
    const raw = fs.readFileSync(page.filepath, "utf-8");
    const markdown = convertMdxToMarkdown(
      raw,
      snippetTree,
      languages,
      page.filepath,
    );
    // Separator, page content, and an empty entry for the blank line after.
    return [`---\n\n<!-- Source: ${page.href} -->\n`, markdown, ""].join("\n");
  });

  return [...header, ...body].join("\n");
}

/**
 * Write one Markdown file per page below `<OUTPUT_DIR>/llms`, mirroring the
 * page's URL path (`<url path>.md`), and log how many pages were processed.
 *
 * @param pages       Pages to write.
 * @param snippetTree Snippet tree used during conversion.
 * @param languages   Languages to keep, or null to keep all.
 */
function generatePerPageMarkdown(
  pages: DocPage[],
  snippetTree: SnippetNode,
  languages: string[] | null,
): void {
  const llmsDir = path.join(OUTPUT_DIR, "llms");

  pages.forEach((page) => {
    const markdown = convertMdxToMarkdown(
      fs.readFileSync(page.filepath, "utf-8"),
      snippetTree,
      languages,
      page.filepath,
    );

    const urlPath = page.href.replace(DOCS_BASE_URL + "/", "");
    const target = path.join(llmsDir, urlPath + ".md");
    fs.mkdirSync(path.dirname(target), { recursive: true });
    fs.writeFileSync(target, markdown);

    if (page.slug !== "index") return;

    // Index pages (e.g. home/index) are also written at the section root
    // (e.g. home.md) so that /llms/home.md resolves — the Next.js
    // router.pathname for a section root is "/home", not "/home/index".
    const sectionTarget = path.join(
      llmsDir,
      urlPath.replace(/\/index$/, "") + ".md",
    );
    fs.writeFileSync(sectionTarget, markdown);
  });

  console.log(
    `  Wrote ${pages.length} per-page markdown files to ${llmsDir}/`,
  );
}

// ---------------------------------------------------------------------------
// CLI & main
// ---------------------------------------------------------------------------
/**
 * Read the `--languages` option from the command line.
 *
 * The value is a comma-separated list; entries are trimmed and lower-cased.
 * An entry that is not a key of LANGUAGE_EXTENSIONS is reported on stderr and
 * the process exits with status 1.
 *
 * @returns The requested languages, or null when the flag is absent or is the
 *   last argument (meaning: all languages).
 */
function parseArgs(): string[] | null {
  const argv = process.argv;
  const flagAt = argv.indexOf("--languages");
  const hasValue = flagAt !== -1 && flagAt + 1 < argv.length;
  if (!hasValue) return null;

  const requested = argv[flagAt + 1]
    .split(",")
    .map((entry) => entry.trim().toLowerCase());

  const valid = Object.keys(LANGUAGE_EXTENSIONS);
  const unknown = requested.find((lang) => !valid.includes(lang));
  if (unknown !== undefined) {
    console.error(
      `Unknown language: ${unknown}. Valid: ${valid.sort().join(", ")}`,
    );
    process.exit(1);
  }

  return requested;
}

/**
 * Entry point: collect the pages, generate every artifact and write them
 * below OUTPUT_DIR.
 *
 * Outputs: llms.txt, llms-full.txt, llms-search-index.json, and the per-page
 * Markdown files (written by generatePerPageMarkdown). The sizes in the log
 * are UTF-8 byte counts of what is written, not string lengths, so they stay
 * correct for content with non-ASCII characters.
 */
function main(): void {
  const languages = parseArgs();

  console.log("Loading snippets...");
  const snippetTree = snippets as unknown as SnippetNode;

  console.log("Collecting pages from _meta.js files...");
  const pages = collectPages();
  console.log(`  Found ${pages.length} pages`);

  console.log("Generating llms.txt...");
  const llmsTxt = generateLlmsTxt(pages);

  console.log("Generating llms-full.txt...");
  const llmsFullTxt = generateLlmsFullTxt(pages, snippetTree, languages);

  console.log("Generating per-page markdown files...");
  generatePerPageMarkdown(pages, snippetTree, languages);

  console.log("Building MiniSearch index...");
  const searchIndexJson = buildSearchIndex(pages, snippetTree, languages);

  fs.mkdirSync(OUTPUT_DIR, { recursive: true });

  // Write one top-level artifact and report its size on disk.
  const writeArtifact = (filename: string, contents: string): void => {
    const target = path.join(OUTPUT_DIR, filename);
    fs.writeFileSync(target, contents);
    const bytes = Buffer.byteLength(contents, "utf8");
    console.log(`  Wrote ${target} (${bytes} bytes)`);
  };

  writeArtifact("llms.txt", llmsTxt);
  writeArtifact("llms-full.txt", llmsFullTxt);
  writeArtifact("llms-search-index.json", searchIndexJson);

  if (languages) {
    console.log(`  Languages: ${languages.join(", ")}`);
  } else {
    console.log("  Languages: all");
  }

  console.log("Done!");
}

main();