fix: Title content after period sometimes stripped in import (#9712)

closes #9694
This commit is contained in:
Tom Moor
2025-07-22 09:00:23 -04:00
committed by GitHub
parent 3466909666
commit b4836cd922
2 changed files with 35 additions and 1 deletions
+21
View File
@@ -37,6 +37,27 @@ describe("documentImporter", () => {
expect(response.title).toEqual("images");
});
// Regression test: the file name "01. test" contains a period that is not
// followed by a file extension, so the imported document's title is expected
// to be the complete file name rather than only the part before the period.
it("should not strip content after period in title", async () => {
  const fileName = "01. test";
  const mimeType =
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

  const user = await buildUser();

  // Reuse the existing Word fixture as the uploaded payload; only the file
  // name differs from the other docx import tests.
  const fixturePath = path.resolve(
    __dirname,
    "..",
    "test",
    "fixtures",
    "images.docx"
  );
  const content = await fs.readFile(fixturePath);

  // Run the import inside a transaction and hand that transaction to the
  // importer through its context.
  const response = await sequelize.transaction((transaction) => {
    const ctx = createContext({ user, transaction });
    return documentImporter({ user, mimeType, fileName, content, ctx });
  });

  // The body must still be converted from the fixture...
  expect(response.text).toContain("This is a test document for images");
  // ...and the title must keep everything after the period.
  expect(response.title).toEqual("01. test");
});
it("should convert Word Document to markdown for application/octet-stream mimetype", async () => {
const user = await buildUser();
const fileName = "images.docx";
+14 -1
View File
@@ -1,4 +1,5 @@
import emojiRegex from "emoji-regex";
import mime from "mime-types";
import truncate from "lodash/truncate";
import parseTitle from "@shared/utils/parseTitle";
import { DocumentValidation } from "@shared/validations";
@@ -35,7 +36,19 @@ async function documentImporter({
fileName,
mimeType
);
let title = fileName.replace(/\.[^/.]+$/, "");
// find valid extensions and remove them from the title
const extensions = [
"docx",
"md",
"markdown",
"html",
...(mime.extensions[mimeType] ?? []),
];
let title = fileName.replace(
new RegExp(`\\.(${extensions.join("|")})$`, "i"),
""
);
// find and extract emoji near the beginning of the document.
const regex = emojiRegex();