* @fileoverview Interface used to extract visible text on the page, add extra
* surrounding text at both ends, and pass it on to a consumer.
*/
import type {TextWithSymbolIndex} from '//ios/web/annotations/resources/text_dom_utils.js';
import {nextLeaf, previousLeaf} from '//ios/web/annotations/resources/text_dom_utils.js';
import type {TextNodeVisitor} from '//ios/web/annotations/resources/text_intersection_observer.js';
/**
 * Default separator the extractor emits when content other than whitespace
 * text or comments is skipped between two pieces of collected text.
 */
const SECTION_BREAK = ' ‡ ';

/**
 * Default character budget for the extra text gathered before and after the
 * collected text.
 */
const EXTRA_CHARACTERS_AT_END = 128;

/**
 * Upper-case `nodeName`s of elements treated as inline: entering or leaving
 * one of these does not insert a separating space into the extracted text.
 */
const KNOWN_INLINE_ELEMENTS: Set<string> = new Set<string>([
  'A', 'ABBR', 'B', 'CITE', 'CODE',
  'I', 'DFN', 'EM', 'MARK', 'SMALL',
  'SPAN', 'STRONG', 'SUB', 'SUP', 'VAR',
]);
/**
 * Pairs a text node with the position (`index`) at which its text starts
 * inside an extracted string. The node is held through a `WeakRef`, so a
 * section does not keep its node alive.
 */
export class TextSection {
  private sourceTextNode: WeakRef<TextWithSymbolIndex>;

  /**
   * @param textNode Node whose text this section represents.
   * @param index Start position of the node's text in the extracted string.
   */
  constructor(textNode: TextWithSymbolIndex, public index: number) {
    this.sourceTextNode = new WeakRef(textNode);
  }

  /** The referenced node, or null once it is no longer available. */
  get textNode(): TextWithSymbolIndex|null {
    const node = this.sourceTextNode.deref();
    return node ? node : null;
  }
}
/**
 * Callback that receives an assembled `TextChunk`. `TextExtractor` stores one
 * of these and invokes it from `end()` when at least one text section was
 * collected. The return value is ignored.
 */
export interface TextChunkConsumer {
(chunk: TextChunk): void;
}
/**
 * A piece of extracted text together with the sections (text node + start
 * index) that make it up.
 *
 * As filled in by `TextExtractor.end()`, `visibleStart` is the length of the
 * leading context and `visibleEnd` is `visibleStart` plus the length of the
 * collected text; `firstNodeOffset` is the offset returned by the prefix
 * extraction.
 */
export class TextChunk {
  /** Concatenation of every text passed to `add()`, in call order. */
  text: string = '';
  /** Sections passed to `add()`, with indices relative to `text`. */
  sections: TextSection[] = [];

  constructor(
      public firstNodeOffset: number, public visibleStart: number,
      public visibleEnd: number) {}

  /**
   * Appends `text` and its `sections` to this chunk.
   *
   * The passed section objects are adopted, not copied: each one's `index` is
   * shifted in place by the length of the text already in the chunk, so that
   * it becomes relative to the chunk's full `text`.
   *
   * @param sections Sections whose indices are relative to `text`.
   * @param text Text to append.
   */
  add(sections: TextSection[], text: string): void {
    const offset = this.text.length;
    this.text += text;
    // Append one element at a time rather than `push(...sections)`: spreading
    // a very large array into call arguments can exceed the engine's argument
    // limit and throw a RangeError. The length is captured up front so the
    // loop stays bounded even if `sections` is `this.sections` itself.
    const count = sections.length;
    for (let i = 0; i < count; i++) {
      const section = sections[i]!;
      section.index += offset;
      this.sections.push(section);
    }
  }
}
/**
 * `TextNodeVisitor` that concatenates the text of the text nodes reported
 * through `visibleTextNode()`. Element boundaries and whitespace-only text
 * collapse to a single space, and skipped content (`invisibleNode()`) is
 * represented by a single `sectionBreak`.
 *
 * On `end()`, extra text from the leaves before the first and after the last
 * collected node is added (bounded by `extraCharactersAtEnd`) and the resulting
 * `TextChunk` is handed to the consumer.
 */
export class TextExtractor implements TextNodeVisitor {
  /**
   * @param consumer Callback invoked by `end()` with the assembled chunk.
   * @param extraCharactersAtEnd Character budget for the extra text gathered
   *     on each side of the collected text.
   * @param sectionBreak Separator emitted where content was skipped.
   */
  constructor(
      private consumer: TextChunkConsumer,
      private extraCharactersAtEnd = EXTRA_CHARACTERS_AT_END,
      private sectionBreak = SECTION_BREAK) {}

  // Text fragments collected since `begin()`, in visiting order.
  private parts: string[] = [];
  // One section per collected text node; `index` is its start in the joined
  // `parts`.
  private sections: TextSection[] = [];
  // True while nothing has been collected or the last fragment is a section
  // break; prevents consecutive breaks.
  private broken = true;
  // Total length of the fragments in `parts`.
  private index = 0;
  // True while nothing has been collected or the last fragment is a separator
  // space; prevents consecutive spaces.
  spaced = true;

  /** Resets all collected state for a new pass. */
  begin(): void {
    this.parts = [];
    this.sections = [];
    this.broken = true;
    this.spaced = true;
    this.index = 0;
  }

  /**
   * Collects `textNode`'s text and records a section for it. A node holding
   * only whitespace contributes at most one separator space instead.
   */
  visibleTextNode(textNode: Text): void {
    const content = textNode.textContent ?? '';
    if (!content.trim()) {
      this.addSpaceIfNeeded();
      return;
    }
    this.parts.push(content);
    this.sections.push(new TextSection(textNode, this.index));
    this.index += content.length;
    this.broken = false;
    this.spaced = false;
  }

  /** Inserts a separator space when entering a non-inline element. */
  enterVisibleNode(node: Node): void {
    this.addSpaceAtElementBoundary(node);
  }

  /** Inserts a separator space when leaving a non-inline element. */
  leaveVisibleNode(node: Node): void {
    this.addSpaceAtElementBoundary(node);
  }

  /**
   * Handles a skipped node: comments are ignored, empty or whitespace-only
   * text nodes become a separator space, and anything else becomes a section
   * break unless the collected text is empty or already ends with one.
   */
  invisibleNode(node: Node): void {
    if (node.nodeType === Node.COMMENT_NODE) {
      return;
    }
    if (node.nodeType === Node.TEXT_NODE && !node.textContent?.trim()) {
      this.addSpaceIfNeeded();
      return;
    }
    if (this.broken) {
      return;
    }
    this.parts.push(this.sectionBreak);
    this.index += this.sectionBreak.length;
    this.broken = true;
    // A break is not a space: whitespace following it still adds one.
    this.spaced = false;
  }

  /**
   * Builds the chunk (prefix + collected text + postfix) and passes it to the
   * consumer. Does nothing when no text node was collected.
   */
  end(): void {
    if (this.sections.length === 0) {
      return;
    }
    // Sections hold their node weakly, so either end may be gone by now. In
    // that case the context on that side is skipped rather than walking from
    // a null node.
    const firstNode = this.sections[0]!.textNode;
    const lastNode = this.sections[this.sections.length - 1]!.textNode;
    const [firstNodeOffset, prefixText, prefixSections]:
        [number, string, TextSection[]] =
        firstNode ? this.extractPrefix(firstNode) : [0, '', []];
    const [postfixText, postfixSections]: [string, TextSection[]] =
        lastNode ? this.extractPostfix(lastNode, this.spaced) : ['', []];

    // `join` instead of `''.concat(...parts)`: spreading an unbounded array
    // into call arguments can throw a RangeError on very large inputs.
    const text = this.parts.join('');
    const chunk = new TextChunk(
        firstNodeOffset, prefixText.length, prefixText.length + text.length);
    chunk.add(prefixSections, prefixText);
    chunk.add(this.sections, text);
    chunk.add(postfixSections, postfixText);
    this.consumer(chunk);
  }

  /** Adds a space for elements that are not in `KNOWN_INLINE_ELEMENTS`. */
  private addSpaceAtElementBoundary(node: Node): void {
    if (node instanceof Element && !KNOWN_INLINE_ELEMENTS.has(node.nodeName)) {
      this.addSpaceIfNeeded();
    }
  }

  /** Appends a single separator space unless the text already ends in one. */
  private addSpaceIfNeeded() {
    if (!this.spaced) {
      this.parts.push(' ');
      this.index++;
      this.spaced = true;
    }
  }

  /**
   * Walks backwards from `beforeNode` over previous leaves and gathers up to
   * `extraCharactersAtEnd` characters (including one trailing separator
   * space) of text preceding it.
   *
   * @returns A tuple of: the offset into the earliest gathered node's text at
   *     which the prefix starts (non-zero when that node was truncated), the
   *     prefix text, and the sections in document order with indices relative
   *     to the prefix text. Returns `[0, '', []]` when no text was found.
   */
  private extractPrefix(beforeNode: Node): [number, string, TextSection[]] {
    const sections: TextSection[] = [];
    // Built back to front: starts with the space that will separate the prefix
    // from the collected text.
    const parts: string[] = [' '];
    // Remaining budget; also the (right-aligned) start index of whatever was
    // gathered last.
    let index = this.extraCharactersAtEnd - 1;
    let offset = 0;
    let spaced = true;
    let node: Node|null = previousLeaf(beforeNode, true);
    while (node && index > 0) {
      if (node.nodeType === Node.TEXT_NODE && node.textContent &&
          node.textContent.trim()) {
        const textLength = node.textContent.length;
        const minLength = Math.min(index, textLength);
        // Keep the tail of the node's text when it does not fit entirely.
        offset = textLength - minLength;
        parts.push(node.textContent.substring(offset));
        sections.push(new TextSection(node as Text, index - minLength));
        index -= minLength;
        spaced = false;
      } else if (!spaced) {
        parts.push(' ');
        index--;
        spaced = true;
      }
      node = previousLeaf(node, true);
    }
    if (sections.length === 0) {
      return [0, '', []];
    }
    sections.reverse();
    parts.reverse();
    if (index > 0) {
      // The budget was not used up: shift the right-aligned indices so the
      // prefix text starts at 0.
      for (const section of sections) {
        section.index -= index;
      }
    }
    return [offset, parts.join(''), sections];
  }

  /**
   * Walks forward from `afterNode` over next leaves and gathers the text
   * following it, preceded by a separator space unless `alreadySpaced`.
   *
   * NOTE(review): when the separator space is inserted, the budget is reduced
   * by one and the running index also starts at 1, so the postfix (space
   * included) is capped at `extraCharactersAtEnd - 1` characters rather than
   * `extraCharactersAtEnd` - confirm this is intended.
   *
   * @returns The postfix text and its sections with indices relative to that
   *     text, or `['', []]` when no text was found.
   */
  private extractPostfix(afterNode: Node, alreadySpaced: boolean):
      [string, TextSection[]] {
    const sections: TextSection[] = [];
    const parts: string[] = [];
    let index = 0;
    if (!alreadySpaced) {
      parts.push(' ');
      index++;
    }
    // Either the caller's text already ends with a space, or one was just
    // added above.
    let spaced = true;
    const maxChars = alreadySpaced ? this.extraCharactersAtEnd :
                                     this.extraCharactersAtEnd - 1;
    let node: Node|null = nextLeaf(afterNode, true);
    while (node && index < maxChars) {
      if (node.nodeType === Node.TEXT_NODE && node.textContent &&
          node.textContent.trim()) {
        const textLength = node.textContent.length;
        const minLength = Math.min(maxChars - index, textLength);
        // Keep the head of the node's text when it does not fit entirely.
        parts.push(node.textContent.substring(0, minLength));
        sections.push(new TextSection(node as Text, index));
        index += minLength;
        spaced = false;
      } else if (!spaced) {
        parts.push(' ');
        index++;
        spaced = true;
      }
      node = nextLeaf(node, true);
    }
    if (sections.length === 0) {
      return ['', []];
    }
    return [parts.join(''), sections];
  }
}