mirror of
https://github.com/outline/outline.git
synced 2025-12-30 15:30:12 -06:00
1120 lines
35 KiB
TypeScript
1120 lines
35 KiB
TypeScript
// Forked from https://github.com/inkling/htmldiff.js/blob/master/js/htmldiff.js
|
|
|
|
// The MIT License (MIT)
|
|
|
|
// Copyright (c) 2012 The Network Inc. and contributors
|
|
// Copyright (c) 2022 idesis GmbH, Max-Keith-Straße 66 (E 11), D-45136 Essen, https://www.idesis.de
|
|
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy of
|
|
// this software and associated documentation files (the "Software"), to deal in
|
|
// the Software without restriction, including without limitation the rights to
|
|
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
|
// of the Software, and to permit persons to whom the Software is furnished to do
|
|
// so, subject to the following conditions:
|
|
|
|
// The above copyright notice and this permission notice shall be included in all
|
|
// copies or substantial portions of the Software.
|
|
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
// SOFTWARE.
|
|
|
|
/**
|
|
* htmldiff.js is a library that compares HTML content. It creates a diff between two
|
|
* HTML documents by combining the two documents and wrapping the differences with
|
|
* <ins> and <del> tags. Here is a high-level overview of how the diff works.
|
|
*
|
|
* 1. Tokenize the before and after HTML with htmlToTokens.
|
|
* 2. Generate a list of operations that convert the before list of tokens to the after
|
|
* list of tokens with calculateOperations, which does the following:
|
|
 *      a. Find all the matching blocks of tokens between the before and after lists of
 *         tokens with findMatchingBlocks. This is done by finding the single longest
 *         matching block with findBestMatch, then iteratively finding the next longest
 *         matching blocks that precede and follow the longest matching block.
|
|
* b. Determine insertions, deletions, and replacements from the matching blocks.
|
|
* This is done in calculateOperations.
|
|
* 3. Render the list of operations by wrapping tokens with <ins> and <del> tags where
|
|
* appropriate with renderOperations.
|
|
*
|
|
* Example usage:
|
|
*
|
|
* var htmldiff = require('htmldiff.js');
|
|
*
|
|
* htmldiff('<p>this is some text</p>', '<p>this is some more text</p>')
|
|
* == '<p>this is some <ins>more </ins>text</p>'
|
|
*
|
|
* htmldiff('<p>this is some text</p>', '<p>this is some more text</p>', 'diff-class')
|
|
* == '<p>this is some <ins class="diff-class">more </ins>text</p>'
|
|
*/
|
|
|
|
"use strict";
|
|
|
|
/**
 * A single unit of tokenized HTML (a tag, a word, a run of whitespace, ...).
 */
type Token = {
  /** The original text; used to recompose the document after diffing. */
  string: string;
  /** The normalized form (see getKeyForToken); used for diff comparisons. */
  key: string;
};
|
|
|
|
/**
 * A window onto the before/after token lists that restricts where matches are
 * searched for.
 */
type Segment = {
  /** The before tokens covered by this segment. */
  beforeTokens: Token[];
  /** The after tokens covered by this segment. */
  afterTokens: Token[];
  /** Offset of this segment's first token within the full before document. */
  beforeIndex: number;
  /** Offset of this segment's first token within the full after document. */
  afterIndex: number;

  /** Token key -> indices into beforeTokens (see createMap). */
  beforeMap: Record<string, number[]>;
  /** Token key -> indices into afterTokens (see createMap). */
  afterMap: Record<string, number[]>;
};
|
|
|
|
/**
 * The shape of the objects produced by the Match constructor: a run of
 * `length` consecutive tokens that appears in both token lists.
 */
type MatchT = {
  /** The segment in which the match was found. */
  segment: Segment;
  /** Number of consecutive matching tokens. */
  length: number;

  // Inclusive positions relative to the full documents
  // (segment offset + position within the segment).
  startInBefore: number;
  endInBefore: number;
  startInAfter: number;
  endInAfter: number;

  // Inclusive positions relative to the segment's own token lists.
  segmentStartInBefore: number;
  segmentStartInAfter: number;
  segmentEndInBefore: number;
  segmentEndInAfter: number;
};
|
|
|
|
/**
 * The kinds of operation a diff can contain. "none" is only used internally as
 * a "no operation yet" sentinel while operations are being calculated.
 */
type OperationType = "insert" | "delete" | "replace" | "equal" | "none";
|
|
|
|
/**
 * One step of the diff: an action applied to an inclusive range of before
 * tokens and/or an inclusive range of after tokens.
 */
type Operation = {
  action: OperationType;
  /** First affected index in the before tokens. */
  startInBefore: number;
  /** Last affected index in the before tokens; null for "insert". */
  endInBefore: number | null;
  /** First affected index in the after tokens. */
  startInAfter: number | null;
  /** Last affected index in the after tokens; null for "delete". */
  endInAfter: number | null;
};
|
|
|
|
/**
 * Checks whether a character closes a tag.
 *
 * @param char The character to test.
 * @returns True if the character is ">".
 */
function isEndOfTag(char: string): boolean {
  return char === ">";
}
|
|
|
|
/**
 * Checks whether a character opens a tag.
 *
 * @param char The character to test.
 * @returns True if the character is "<".
 */
function isStartOfTag(char: string): boolean {
  return char === "<";
}
|
|
|
|
/**
 * Checks whether a string is non-empty and consists solely of whitespace.
 *
 * @param char The character (or string) to test.
 * @returns True if every character is whitespace and there is at least one.
 */
function isWhitespace(char: string): boolean {
  return /^\s+$/.test(char);
}
|
|
|
|
/**
 * Determines if the given token is a tag.
 *
 * Tokens that start with "<!" (comments, doctype) are not considered tags.
 *
 * @param {string} token The token in question.
 *
 * @return {boolean|string} False if the token is not a tag, or the tag name otherwise
 *      (closing tags keep their leading slash, e.g. "/div").
 */
function isTag(token: string): boolean | string {
  const match = token.match(/^\s*<([^!>][^>]*)>\s*$/);
  // Split on any whitespace, not just a space: attributes may be separated from
  // the tag name by a tab or a newline.
  return !!match && match[1].trim().split(/\s+/)[0];
}
|
|
|
|
/**
 * The negation of isTag.
 *
 * @param token The token in question.
 * @returns True if the token is not a tag.
 */
function isntTag(token: string): boolean {
  return !isTag(token);
}
|
|
|
|
/**
 * Checks whether the characters read so far begin an HTML comment.
 *
 * @param word The characters of the current token read so far.
 * @returns True if the word starts with "<!--".
 */
function isStartofHTMLComment(word: string): boolean {
  return /^<!--/.test(word);
}
|
|
|
|
/**
 * Checks whether the characters read so far complete an HTML comment.
 *
 * @param word The characters of the current token read so far.
 * @returns True if the word ends with "-->".
 */
function isEndOfHTMLComment(word: string): boolean {
  return /-->$/.test(word);
}
|
|
|
|
/**
 * Checks if the current word is the beginning of an atomic tag. An atomic tag is one whose
 * child nodes should not be compared - the entire tag should be treated as one token. This
 * is useful for tags where it does not make sense to insert <ins> and <del> tags.
 *
 * @param {string} word The characters of the current token read so far.
 *
 * @return {string|null} The name of the atomic tag if the word will be an atomic tag,
 *      null otherwise
 */
function isStartOfAtomicTag(word: string): string | null {
  // Note: "math" was removed from this list on Outline fork to support math-display, math-inline nodes
  const result = /^<(iframe|object|svg|script)/.exec(word);
  return result && result[1];
}
|
|
|
|
/**
 * Checks if the current word is the end of an atomic tag (i.e. it has all the characters,
 * except for the end bracket of the closing tag, such as '<iframe></iframe').
 *
 * @param {string} word The characters of the current token read so far.
 * @param {string} tag The ending tag to look for.
 *
 * @return {boolean} True if the word is now a complete token (including the end tag),
 *      false otherwise.
 */
function isEndOfAtomicTag(word: string, tag: string): boolean {
  // The word must end with "</" + tag; the 2 accounts for the "</" prefix.
  return word.substring(word.length - tag.length - 2) === "</" + tag;
}
|
|
|
|
/**
 * Checks if a tag is a void tag. Only the self-closing syntax is recognised,
 * i.e. a single tag whose closing bracket is preceded by a slash ("<br/>").
 *
 * @param {string} token The token to check.
 *
 * @return {boolean} True if the token is a void tag, false otherwise.
 */
function isVoidTag(token: string): boolean {
  return /^\s*<[^>]+\/>\s*$/.test(token);
}
|
|
|
|
/**
 * Checks if a token can be wrapped inside a tag. Non-tag tokens, atomic tags
 * and void tags are wrappable; every other tag is not.
 *
 * @param {string} token The token to check.
 *
 * @return {boolean} True if the token can be wrapped inside a tag, false otherwise.
 */
function isWrappable(token: string): boolean {
  return isntTag(token) || !!isStartOfAtomicTag(token) || isVoidTag(token);
}
|
|
|
|
/**
 * Creates a token that holds a string and key representation. The key is used for diffing
 * comparisons and the string is used to recompose the document after the diff is complete.
 *
 * @param {string} currentWord The section of the document to create a token for.
 *
 * @return {Token} A token object with a string and key property.
 */
function createToken(currentWord: string): Token {
  return {
    string: currentWord,
    key: getKeyForToken(currentWord),
  };
}
|
|
|
|
/**
 * A Match stores the information of a matching block. A matching block is a list of
 * consecutive tokens that appear in both the before and after lists of tokens.
 *
 * This is a constructor function: call it with `new`. The start positions passed in
 * are relative to the segment; the constructor derives both segment-relative and
 * document-relative (segment offset added) inclusive ranges from them.
 *
 * @param {number} startInBefore The index of the first token in the segment's before tokens.
 * @param {number} startInAfter The index of the first token in the segment's after tokens.
 * @param {number} length The number of consecutive matching tokens in this block.
 * @param {Segment} segment The segment where the match was found.
 */
function Match(
  startInBefore: number,
  startInAfter: number,
  length: number,
  segment: Segment
) {
  this.segment = segment;
  this.length = length;

  // Positions within the full documents.
  this.startInBefore = startInBefore + segment.beforeIndex;
  this.startInAfter = startInAfter + segment.afterIndex;
  this.endInBefore = this.startInBefore + this.length - 1;
  this.endInAfter = this.startInAfter + this.length - 1;

  // Positions within the segment's own token lists.
  this.segmentStartInBefore = startInBefore;
  this.segmentStartInAfter = startInAfter;
  this.segmentEndInBefore = this.segmentStartInBefore + this.length - 1;
  this.segmentEndInAfter = this.segmentStartInAfter + this.length - 1;
}
|
|
|
|
/**
 * Tokenizes a string of HTML.
 *
 * The input is scanned one character at a time by a small state machine with the
 * modes "char", "whitespace", "tag", "atomic_tag" and "html_comment". Tags, runs of
 * whitespace, words and individual punctuation characters each become one token;
 * atomic tags are emitted whole (including their content) and HTML comments are
 * dropped from the output.
 *
 * @param {string} html The string to tokenize.
 *
 * @return {Array} The list of tokens.
 */
function htmlToTokens(html: string): Token[] {
  let mode = "char";
  let currentWord = "";
  let currentAtomicTag = "";
  const words: Token[] = [];

  for (let i = 0; i < html.length; i++) {
    const char = html[i];
    switch (mode) {
      case "tag": {
        const atomicTag = isStartOfAtomicTag(currentWord);
        if (atomicTag) {
          mode = "atomic_tag";
          currentAtomicTag = atomicTag;
          currentWord += char;
        } else if (isStartofHTMLComment(currentWord)) {
          mode = "html_comment";
          currentWord += char;
        } else if (isEndOfTag(char)) {
          currentWord += ">";
          words.push(createToken(currentWord));
          currentWord = "";
          // char is ">" here, so the next mode is always "char".
          mode = "char";
        } else {
          currentWord += char;
        }
        break;
      }
      case "atomic_tag":
        if (
          isEndOfTag(char) &&
          isEndOfAtomicTag(currentWord, currentAtomicTag)
        ) {
          currentWord += ">";
          words.push(createToken(currentWord));
          currentWord = "";
          currentAtomicTag = "";
          mode = "char";
        } else {
          currentWord += char;
        }
        break;
      case "html_comment":
        currentWord += char;
        if (isEndOfHTMLComment(currentWord)) {
          // Comments are discarded rather than emitted as tokens.
          currentWord = "";
          mode = "char";
        }
        break;
      case "char":
        if (isStartOfTag(char)) {
          if (currentWord) {
            words.push(createToken(currentWord));
          }
          currentWord = "<";
          mode = "tag";
        } else if (/\s/.test(char)) {
          if (currentWord) {
            words.push(createToken(currentWord));
          }
          currentWord = char;
          mode = "whitespace";
        } else if (/[\w#@]/.test(char)) {
          currentWord += char;
        } else if (/&/.test(char)) {
          // An ampersand starts a new word so that an entity such as "&amp;"
          // is not glued to the word preceding it.
          if (currentWord) {
            words.push(createToken(currentWord));
          }
          currentWord = char;
        } else {
          // Any other character ends the current word and is included in it.
          currentWord += char;
          words.push(createToken(currentWord));
          currentWord = "";
        }
        break;
      case "whitespace":
        if (isStartOfTag(char)) {
          if (currentWord) {
            words.push(createToken(currentWord));
          }
          currentWord = "<";
          mode = "tag";
        } else if (isWhitespace(char)) {
          currentWord += char;
        } else {
          if (currentWord) {
            words.push(createToken(currentWord));
          }
          currentWord = char;
          mode = "char";
        }
        break;
      default:
        throw new Error("Unknown mode " + mode);
    }
  }

  // Flush whatever was being accumulated when the input ended.
  if (currentWord) {
    words.push(createToken(currentWord));
  }

  return words;
}
|
|
|
|
/**
 * Creates a key that should be used to match tokens. This is useful, for example, if we want
 * to consider two open tag tokens as equal, even if they don't have the same attributes. We
 * use a key instead of overwriting the token because we may want to render the original string
 * without losing the attributes.
 *
 * Tags are reduced to their lower-cased name ("<a>", "</p>"). In any other token, runs of
 * whitespace and non-breaking-space entities are each replaced by a single space.
 *
 * @param {string} token The token to create the key for.
 *
 * @return {string} The identifying key that should be used to match before and after tokens.
 */
function getKeyForToken(token: string): string {
  const tagName = /<([^\s>]+)[\s>]/.exec(token);
  if (tagName) {
    return "<" + tagName[1].toLowerCase() + ">";
  }
  // "&nbsp;" and "&#160;" are spelled out as entities: the tokenizer emits them as
  // their own tokens, and \s only covers the literal non-breaking space character.
  return token && token.replace(/(\s+|&nbsp;|&#160;)/g, " ");
}
|
|
|
|
/**
 * Creates a map from token key to an array of indices of locations of the matching token in
 * the list of all tokens.
 *
 * The map is created without a prototype, so token keys such as "constructor" cannot
 * collide with inherited properties.
 *
 * @param {Array.<Token>} tokens The list of tokens to be mapped.
 *
 * @return {Object} A mapping that can be used to search for tokens.
 */
function createMap(tokens: Token[]): Record<string, number[]> {
  return tokens.reduce<Record<string, number[]>>(function (map, token, index) {
    if (map[token.key]) {
      map[token.key].push(index);
    } else {
      map[token.key] = [index];
    }
    return map;
  }, Object.create(null));
}
|
|
|
|
/**
 * Compares two match objects to determine if the second match object comes before or after the
 * first match object. A match only counts as "before" (or "after") when that holds in both
 * the before tokens and the after tokens.
 *
 * @param {MatchT} m1 The first match object to compare.
 * @param {MatchT} m2 The second match object to compare.
 *
 * @return {number} Returns -1 if the m2 should come before m1. Returns 1 if m1 should come
 *      before m2. If the two matches overlap or criss-cross each other, 0 is returned.
 */
function compareMatches(m1: MatchT, m2: MatchT): number {
  if (m2.endInBefore < m1.startInBefore && m2.endInAfter < m1.startInAfter) {
    return -1;
  } else if (
    m2.startInBefore > m1.endInBefore &&
    m2.startInAfter > m1.endInAfter
  ) {
    return 1;
  } else {
    return 0;
  }
}
|
|
|
|
/**
 * A constructor for a binary search tree used to keep match objects in the proper order as
 * they're found. Ordering is defined by compareMatches.
 *
 * @constructor
 */
function MatchBinarySearchTree() {
  this._root = null;
}

MatchBinarySearchTree.prototype = {
  /**
   * Adds matches to the binary search tree. A match that neither strictly precedes nor
   * strictly follows a node met on the way down (compareMatches returns 0) is silently
   * dropped.
   *
   * @param {MatchT} value The match to add to the binary search tree.
   */
  add(value: MatchT) {
    // Create the node to hold the match value.
    const node = {
      value,
      left: null,
      right: null,
    };

    let current = this._root;
    if (current) {
      // eslint-disable-next-line no-constant-condition
      while (true) {
        // Determine if the match value should go to the left or right of the current
        // node.
        const position = compareMatches(current.value, value);
        if (position === -1) {
          // The position of the match is to the left of this node.
          if (current.left) {
            current = current.left;
          } else {
            current.left = node;
            break;
          }
        } else if (position === 1) {
          // The position of the match is to the right of this node.
          if (current.right) {
            current = current.right;
          } else {
            current.right = node;
            break;
          }
        } else {
          // If 0 was returned from compareMatches, that means the node cannot
          // be inserted because it overlaps an existing node.
          break;
        }
      }
    } else {
      // If no nodes exist in the tree, make this the root node.
      this._root = node;
    }
  },

  /**
   * Converts the binary search tree into an array using an in-order traversal.
   *
   * @return {Array.<MatchT>} An array containing the matches in the binary search tree.
   */
  toArray(): Array<MatchT> {
    type Node = {
      value: MatchT;
      left: Node | null;
      right: Node | null;
    };

    function inOrder(node: Node | null, nodes: MatchT[]) {
      if (node) {
        inOrder(node.left, nodes);
        nodes.push(node.value);
        inOrder(node.right, nodes);
      }
      return nodes;
    }

    return inOrder(this._root, []);
  },
};
|
|
|
|
/**
 * Finds and returns the best match between the before and after arrays contained in the segment
 * provided. The best match is the longest one; among equally long matches the first one
 * found is kept.
 *
 * @param {Segment} segment The segment in which to look for a match.
 *
 * @return {MatchT} The best match, or null if the segment contains no match.
 */
function findBestMatch(segment: Segment): MatchT | null {
  const beforeTokens = segment.beforeTokens;
  const afterMap = segment.afterMap;
  let lastSpace: number | null = null;
  let bestMatch: MatchT | null = null;

  // Iterate through the entirety of the beforeTokens to find the best match.
  for (let beforeIndex = 0; beforeIndex < beforeTokens.length; beforeIndex++) {
    // If the current best match is longer than the remaining tokens, we can bail because we
    // won't find a better match.
    const remainingTokens = beforeTokens.length - beforeIndex;
    if (bestMatch && remainingTokens < bestMatch.length) {
      break;
    }

    // If the current token is whitespace, make a note of it and move on. Trying to start a
    // set of matches with whitespace is not efficient because it's too prevalent in most
    // documents. Instead, if the next token yields a match, we'll see if the whitespace can
    // be included in that match.
    const beforeToken = beforeTokens[beforeIndex];
    if (beforeToken.key === " ") {
      lastSpace = beforeIndex;
      continue;
    }

    // Check to see if we just skipped a space, if so, we'll ask getFullMatch to look behind
    // by one token to see if it can include the whitespace.
    const lookBehind = lastSpace === beforeIndex - 1;

    // If the current token is not found in the afterTokens, it won't match and we can move
    // on.
    const afterTokenLocations = afterMap[beforeToken.key];
    if (!afterTokenLocations) {
      continue;
    }

    // For each instance of the current token in afterTokens, let's see how big of a match
    // we can build.
    for (const afterIndex of afterTokenLocations) {
      // getFullMatch will see how far the current token match will go in both
      // beforeTokens and afterTokens.
      const bestMatchLength = bestMatch ? bestMatch.length : 0;
      const match = getFullMatch(
        segment,
        beforeIndex,
        afterIndex,
        bestMatchLength,
        lookBehind
      );

      // If we got a new best match, we'll save it aside.
      if (match && match.length > bestMatchLength) {
        bestMatch = match;
      }
    }
  }

  return bestMatch;
}
|
|
|
|
/**
 * Takes the start of a match, and expands it in the beforeTokens and afterTokens of the
 * current segment as far as it can go.
 *
 * The caller is expected to pass start positions whose tokens already have equal keys;
 * the token at the start position itself is not compared again here.
 *
 * @param {Segment} segment The segment object to search within when expanding the match.
 * @param {number} beforeStart The offset within beforeTokens to start looking.
 * @param {number} afterStart The offset within afterTokens to start looking.
 * @param {number} minLength The minimum length match that must be found.
 * @param {boolean} lookBehind If true, attempt to match a whitespace token just before the
 *      beforeStart and afterStart tokens.
 *
 * @return {MatchT} The full match, or undefined when the match cannot exceed minLength.
 */
function getFullMatch(
  segment: Segment,
  beforeStart: number,
  afterStart: number,
  minLength: number,
  lookBehind: boolean
): MatchT | undefined {
  const beforeTokens = segment.beforeTokens;
  const afterTokens = segment.afterTokens;

  // If we already have a match that goes to the end of the document, no need to keep looking.
  const minBeforeIndex = beforeStart + minLength;
  const minAfterIndex = afterStart + minLength;
  if (
    minBeforeIndex >= beforeTokens.length ||
    minAfterIndex >= afterTokens.length
  ) {
    return;
  }

  // If a minLength was provided, we can do a quick check to see if the tokens after that
  // length match. If not, we won't be beating the previous best match, and we can bail out
  // early.
  if (minLength) {
    const nextBeforeWord = beforeTokens[minBeforeIndex].key;
    const nextAfterWord = afterTokens[minAfterIndex].key;
    if (nextBeforeWord !== nextAfterWord) {
      return;
    }
  }

  // Extend the current match as far forward as it can go, without overflowing beforeTokens or
  // afterTokens.
  let searching = true;
  let currentLength = 1;
  let beforeIndex = beforeStart + currentLength;
  let afterIndex = afterStart + currentLength;

  while (
    searching &&
    beforeIndex < beforeTokens.length &&
    afterIndex < afterTokens.length
  ) {
    const beforeWord = beforeTokens[beforeIndex].key;
    const afterWord = afterTokens[afterIndex].key;
    if (beforeWord === afterWord) {
      currentLength++;
      beforeIndex = beforeStart + currentLength;
      afterIndex = afterStart + currentLength;
    } else {
      searching = false;
    }
  }

  // If we've been asked to look behind, it's because both beforeTokens and afterTokens may
  // have a whitespace token just behind the current match that was previously ignored. If so,
  // we'll expand the current match to include it.
  if (lookBehind && beforeStart > 0 && afterStart > 0) {
    const prevBeforeKey = beforeTokens[beforeStart - 1].key;
    const prevAfterKey = afterTokens[afterStart - 1].key;
    if (prevBeforeKey === " " && prevAfterKey === " ") {
      beforeStart--;
      afterStart--;
      currentLength++;
    }
  }

  // @ts-expect-error: New construct any
  return new Match(beforeStart, afterStart, currentLength, segment);
}
|
|
|
|
/**
 * Creates segment objects from the original document that can be used to restrict the area that
 * findBestMatch and its helper functions search to increase performance. The key-to-index
 * maps for both token lists are built here.
 *
 * @param {Array.<Token>} beforeTokens Tokens from the before document.
 * @param {Array.<Token>} afterTokens Tokens from the after document.
 * @param {number} beforeIndex The index within the before document where this segment begins.
 * @param {number} afterIndex The index within the after document where this segment begins.
 *
 * @return {Segment} The segment object.
 */
function createSegment(
  beforeTokens: Token[],
  afterTokens: Token[],
  beforeIndex: number,
  afterIndex: number
): Segment {
  return {
    beforeTokens,
    afterTokens,
    beforeMap: createMap(beforeTokens),
    afterMap: createMap(afterTokens),
    beforeIndex,
    afterIndex,
  };
}
|
|
|
|
/**
 * Finds all the matching blocks within the given segment in the before and after lists of
 * tokens.
 *
 * @param {Segment} segment The segment that should be searched for matching blocks.
 *
 * @return {Array.<Match>} The list of matching blocks in this range, in document order.
 */
function findMatchingBlocks(segment: Segment): Array<MatchT> {
  // Create a binary search tree to hold the matches we find in order.
  // @ts-expect-error: New construct any
  const matches = new MatchBinarySearchTree();
  const segments: Segment[] = [segment];

  // Each time the best match is found in a segment, zero, one or two new segments may be
  // created from the parts of the original segment not included in the match. We will
  // continue to iterate until all segments have been processed.
  while (segments.length) {
    segment = segments.pop() as Segment;
    const match = findBestMatch(segment);

    if (match && match.length) {
      // If there's an unmatched area at the start of the segment, create a new segment
      // from that area and throw it into the segments array to get processed.
      if (match.segmentStartInBefore > 0 && match.segmentStartInAfter > 0) {
        const leftBeforeTokens = segment.beforeTokens.slice(
          0,
          match.segmentStartInBefore
        );
        const leftAfterTokens = segment.afterTokens.slice(
          0,
          match.segmentStartInAfter
        );

        segments.push(
          createSegment(
            leftBeforeTokens,
            leftAfterTokens,
            segment.beforeIndex,
            segment.afterIndex
          )
        );
      }

      // If there's an unmatched area at the end of the segment, create a new segment from that
      // area and throw it into the segments array to get processed.
      const rightBeforeTokens = segment.beforeTokens.slice(
        match.segmentEndInBefore + 1
      );
      const rightAfterTokens = segment.afterTokens.slice(
        match.segmentEndInAfter + 1
      );
      const rightBeforeIndex =
        segment.beforeIndex + match.segmentEndInBefore + 1;
      const rightAfterIndex = segment.afterIndex + match.segmentEndInAfter + 1;

      if (rightBeforeTokens.length && rightAfterTokens.length) {
        segments.push(
          createSegment(
            rightBeforeTokens,
            rightAfterTokens,
            rightBeforeIndex,
            rightAfterIndex
          )
        );
      }

      matches.add(match);
    }
  }

  return matches.toArray();
}
|
|
|
|
/**
 * Gets a list of operations required to transform the before list of tokens into the
 * after list of tokens. An operation describes whether a particular list of consecutive
 * tokens are equal, replaced, inserted, or deleted.
 *
 * After the raw operations are derived from the matching blocks, a post-processing pass
 * folds consecutive replaces, and a single unchanged whitespace token that follows a
 * replace, into the preceding replace.
 *
 * @param {Array.<Token>} beforeTokens The before list of tokens.
 * @param {Array.<Token>} afterTokens The after list of tokens.
 *
 * @return {Array.<Object>} The list of operations to transform the before list of
 *      tokens into the after list of tokens, where each operation has the following
 *      keys:
 *      - {string} action One of {'replace', 'insert', 'delete', 'equal'}.
 *      - {number} startInBefore The beginning of the range in the list of before tokens.
 *      - {number} endInBefore The end of the range in the list of before tokens.
 *      - {number} startInAfter The beginning of the range in the list of after tokens.
 *      - {number} endInAfter The end of the range in the list of after tokens.
 *
 * @throws {Error} If either token list is missing.
 */
function calculateOperations(
  beforeTokens: Token[],
  afterTokens: Token[]
): Array<Operation> {
  if (!beforeTokens) {
    throw new Error("Missing beforeTokens");
  }
  if (!afterTokens) {
    throw new Error("Missing afterTokens");
  }

  let positionInBefore = 0;
  let positionInAfter = 0;
  const operations: Operation[] = [];
  const segment = createSegment(beforeTokens, afterTokens, 0, 0);
  const matches = findMatchingBlocks(segment);

  // A zero-length sentinel match positioned just past the end of both token lists, so
  // that trailing unmatched tokens are turned into an operation by the loop below.
  // @ts-expect-error New construct any
  matches.push(new Match(beforeTokens.length, afterTokens.length, 0, segment));

  for (let index = 0; index < matches.length; index++) {
    const match = matches[index];

    // Work out what happened between the current positions and the start of this match.
    let actionUpToMatchPositions: OperationType = "none";
    if (positionInBefore === match.startInBefore) {
      if (positionInAfter !== match.startInAfter) {
        actionUpToMatchPositions = "insert";
      }
    } else {
      actionUpToMatchPositions = "delete";
      if (positionInAfter !== match.startInAfter) {
        actionUpToMatchPositions = "replace";
      }
    }

    if (actionUpToMatchPositions !== "none") {
      operations.push({
        action: actionUpToMatchPositions,
        startInBefore: positionInBefore,
        endInBefore:
          actionUpToMatchPositions !== "insert"
            ? match.startInBefore - 1
            : null,
        startInAfter: positionInAfter,
        endInAfter:
          actionUpToMatchPositions !== "delete" ? match.startInAfter - 1 : null,
      });
    }

    if (match.length !== 0) {
      operations.push({
        action: "equal",
        startInBefore: match.startInBefore,
        endInBefore: match.endInBefore,
        startInAfter: match.startInAfter,
        endInAfter: match.endInAfter,
      });
    }

    positionInBefore = match.endInBefore + 1;
    positionInAfter = match.endInAfter + 1;
  }

  const postProcessed: Operation[] = [];
  let lastOp = { action: "none" } as Operation;

  // True for an "equal" operation that spans exactly one token whose text is a single
  // whitespace character.
  function isSingleWhitespace(op: Operation) {
    if (op.action !== "equal") {
      return false;
    }
    if ((op.endInBefore ?? 0) - op.startInBefore !== 0) {
      return false;
    }
    // Test the token's text. Testing the sliced Token array would stringify it to
    // "[object Object]", which never matches.
    const token = beforeTokens[op.startInBefore];
    return !!token && /^\s$/.test(token.string);
  }

  for (let i = 0; i < operations.length; i++) {
    const op = operations[i];

    if (
      (isSingleWhitespace(op) && lastOp.action === "replace") ||
      (op.action === "replace" && lastOp.action === "replace")
    ) {
      // Extend the previous replace over this operation instead of emitting it.
      lastOp.endInBefore = op.endInBefore;
      lastOp.endInAfter = op.endInAfter;
    } else {
      postProcessed.push(op);
      lastOp = op;
    }
  }
  return postProcessed;
}
|
|
|
|
/**
 * A TokenWrapper provides a utility for grouping segments of tokens based on whether they're
 * wrappable or not, and for spotting tags that are both opened and closed within the given
 * set of tokens. For example, given the following tokens:
 *
 *      ['</b>', 'this', ' ', 'is', ' ', 'a', ' ', '<b>', 'test', '</b>', '!']
 *
 * The first '</b>' has no opening tag within the array, so nothing is flagged for it. The
 * '<b>' is flagged (insertedTag) because its closing '</b>' follows within the same array.
 *
 * For every token a note is recorded with:
 *  - isWrappable: the result of isWrappable(token);
 *  - insertedTag: true on a non-void tag token once the tag that closes it is met while
 *    it is on top of the stack of still-open tags.
 *
 * TokenWrapper has a method 'combine' which allows walking over the segments to wrap them in
 * tags.
 *
 * This is a constructor function: call it with `new`.
 *
 * @param tokens The list of token strings to analyse.
 */
function TokenWrapper(tokens: any) {
  this.tokens = tokens;
  this.notes = tokens.reduce(
    function (data: any, token: any, index: number) {
      data.notes.push({
        isWrappable: isWrappable(token),
        insertedTag: false,
      });

      // Void tags never have a closing counterpart, so they are not tracked.
      const tag = !isVoidTag(token) && isTag(token);
      const lastEntry = data.tagStack[data.tagStack.length - 1];
      if (tag) {
        if (lastEntry && "/" + lastEntry.tag === tag) {
          // This token closes the tag on top of the stack: flag the opening tag.
          data.notes[lastEntry.position].insertedTag = true;
          data.tagStack.pop();
        } else {
          data.tagStack.push({
            tag,
            position: index,
          });
        }
      }
      return data;
    },
    { notes: [], tagStack: [] }
  ).notes;
}
|
|
|
|
/**
 * Wraps the contained tokens in tags based on output given by a map function. Each segment of
 * tokens will be visited. A segment is a continuous run of either all wrappable
 * tokens or unwrappable tokens. The given map function will be called with each segment of
 * tokens and the resulting strings will be combined to form the wrapped HTML.
 *
 * @param mapFn A function called with each segment, an object holding the segment's tokens
 *      and whether those tokens are wrappable or not. The result should be a string.
 * @param tagFn A function called with every token flagged as insertedTag (an opening tag
 *      that is closed within the token list). Its result replaces that token before the
 *      segments are built.
 *
 * @return {string} The concatenation of the strings returned by mapFn.
 */
TokenWrapper.prototype.combine = function (
  mapFn: (segment: { isWrappable: boolean; tokens: string[] }) => string,
  tagFn: (token: string) => string
) {
  const notes = this.notes;
  // Work on a copy so that the wrapper's own token list is left untouched.
  const tokens = this.tokens.slice();
  const segments = tokens.reduce(
    function (data: any, _token: string, index: number) {
      if (notes[index].insertedTag) {
        tokens[index] = tagFn(tokens[index]);
      }
      if (data.status === null) {
        data.status = notes[index].isWrappable;
      }
      const status = notes[index].isWrappable;
      if (status !== data.status) {
        // The wrappable status flipped: close the segment collected so far.
        data.list.push({
          isWrappable: data.status,
          tokens: tokens.slice(data.lastIndex, index),
        });
        data.lastIndex = index;
        data.status = status;
      }
      if (index === tokens.length - 1) {
        // Last token: close the final segment.
        data.list.push({
          isWrappable: data.status,
          tokens: tokens.slice(data.lastIndex, index + 1),
        });
      }
      return data;
    },
    { list: [], status: null, lastIndex: 0 }
  ).list;

  return segments.map(mapFn).join("");
};
|
|
|
|
/**
 * Wraps and concatenates a list of tokens with a tag. Does not wrap tag tokens,
 * unless they are wrappable (i.e. void and atomic tags).
 *
 * Wrappable runs that contain only whitespace produce no output. Opening tags that are
 * closed within the content are annotated with data-diff-node and operation-index
 * attributes rather than being wrapped.
 *
 * @param {string} tag The tag name of the wrapper tags.
 * @param {Array.<string>} content The list of tokens to wrap.
 * @param {number} opIndex The index of the operation the content belongs to; written into
 *      the operation-index data attribute.
 * @param {string} dataPrefix (Optional) The prefix to use in data attributes.
 * @param {string} className (Optional) The class name to include in the wrapper tag.
 *
 * @return {string} The rendered HTML.
 */
function wrap(
  tag: string,
  content: Array<string>,
  opIndex: number,
  dataPrefix: string,
  className: string
) {
  // @ts-expect-error New constructor any
  const wrapper = new TokenWrapper(content);
  dataPrefix = dataPrefix ? dataPrefix + "-" : "";
  let attrs = " data-" + dataPrefix + 'operation-index="' + opIndex + '"';
  if (className) {
    attrs += ' class="' + className + '"';
  }

  return wrapper.combine(
    function (segment: any) {
      if (segment.isWrappable) {
        const val = segment.tokens.join("");
        if (val.trim()) {
          return "<" + tag + attrs + ">" + val + "</" + tag + ">";
        }
      } else {
        return segment.tokens.join("");
      }
      // A wrappable segment made up solely of whitespace is dropped.
      return "";
    },
    function (openingTag: string) {
      let dataAttrs = ' data-diff-node="' + tag + '"';
      dataAttrs += " data-" + dataPrefix + 'operation-index="' + opIndex + '"';

      // Insert the attributes just before the tag's closing bracket.
      return openingTag.replace(/>\s*$/, dataAttrs + "$&");
    }
  );
}
|
|
|
|
/**
 * Renderers keyed by operation action. Each of OPS.equal/insert/delete/replace
 * turns a single operation into HTML content.
 *
 * @param {Operation} op The operation that applies to a particular list of tokens.
 * @param {Array.<Token>} beforeTokens The before list of tokens.
 * @param {Array.<Token>} afterTokens The after list of tokens.
 * @param {number} opIndex The index into the list of operations that identifies the change to
 *      be rendered. This is used to mark wrapped HTML as part of the same operation.
 * @param {string} dataPrefix (Optional) The prefix to use in data attributes.
 * @param {string} className (Optional) The class name to include in the wrapper tag.
 *
 * @return {string} The rendering of that operation.
 */
const OPS = {
  // Unchanged content: emit the "after" tokens of the range verbatim.
  equal(op: Operation, _beforeTokens: Token[], afterTokens: Token[]) {
    const from = op.startInAfter ?? 0;
    const to = (op.endInAfter ?? 0) + 1;
    let html = "";
    for (const token of afterTokens.slice(from, to)) {
      html += token.string;
    }
    return html;
  },

  // Added content: the "after" tokens of the range, wrapped in <ins>.
  insert(
    op: Operation,
    _beforeTokens: Token[],
    afterTokens: Token[],
    opIndex: number,
    dataPrefix: string,
    className: string
  ) {
    const inserted = afterTokens
      .slice(op.startInAfter ?? 0, (op.endInAfter ?? 0) + 1)
      .map((token) => token.string);
    return wrap("ins", inserted, opIndex, dataPrefix, className);
  },

  // Removed content: the "before" tokens of the range, wrapped in <del>.
  delete(
    op: Operation,
    beforeTokens: Token[],
    _afterTokens: Token[],
    opIndex: number,
    dataPrefix: string,
    className: string
  ) {
    const removed = beforeTokens
      .slice(op.startInBefore, (op.endInBefore ?? 0) + 1)
      .map((token) => token.string);
    return wrap("del", removed, opIndex, dataPrefix, className);
  },

  // A replacement is rendered as the deletion followed by the insertion.
  replace(...rest: any[]) {
    return OPS.delete.apply(null, rest) + OPS.insert.apply(null, rest);
  },
};
|
|
|
|
/**
 * Renders a list of operations into HTML content. The result is the combined version
 * of the before and after tokens with the differences wrapped in tags.
 *
 * Operations whose action has no renderer in OPS contribute nothing to the output.
 *
 * @param {Array.<Token>} beforeTokens The before list of tokens.
 * @param {Array.<Token>} afterTokens The after list of tokens.
 * @param {Array.<Operation>} operations The list of operations to transform the before
 *      list of tokens into the after list of tokens.
 * @param {string} dataPrefix (Optional) The prefix to use in data attributes.
 * @param {string} className (Optional) The class name to include in the wrapper tag.
 *
 * @return {string} The rendering of the list of operations.
 */
function renderOperations(
  beforeTokens: Token[],
  afterTokens: Token[],
  operations: Operation[],
  dataPrefix?: string,
  className?: string
): string {
  return operations.reduce(function (rendering, op, index) {
    // The optional call yields `undefined` for an unknown action; fall back to
    // an empty string so the literal text "undefined" never leaks into the HTML.
    return (
      rendering +
      // @ts-expect-error TODO
      (OPS[op.action]?.(
        op,
        beforeTokens,
        afterTokens,
        index,
        dataPrefix,
        className
      ) ?? "")
    );
  }, "");
}
|
|
|
|
/**
 * Produces the merged HTML of two documents, with the differences between them
 * wrapped in <ins> and <del> tags. Identical inputs are returned unchanged.
 *
 * @param {string} before The HTML content before the changes.
 * @param {string} after The HTML content after the changes.
 * @param {string} className (Optional) The class attribute to include in <ins> and <del> tags.
 * @param {string} dataPrefix (Optional) The data prefix to use for data attributes. The
 *      operation index data attribute will be named `data-${dataPrefix-}operation-index`.
 *
 * @return {string} The combined HTML content with differences wrapped in <ins> and <del> tags.
 */
function diff(
  before: string,
  after: string,
  className?: string,
  dataPrefix?: string
): string {
  if (before !== after) {
    const beforeTokens = htmlToTokens(before);
    const afterTokens = htmlToTokens(after);
    const operations = calculateOperations(beforeTokens, afterTokens);

    // Note the argument order: renderOperations takes dataPrefix before className.
    return renderOperations(
      beforeTokens,
      afterTokens,
      operations,
      dataPrefix,
      className
    );
  }

  // Nothing changed, so there is nothing to tokenize or mark up.
  return before;
}
|
|
|
|
export default diff;
|