/**
 * Copyright (c) Microsoft Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import { SelectorEngine, SelectorType, SelectorRoot } from './selectorEngine';

// A single parsed step of a selector.
// - combinator: how the step relates to the previous one ('' for the plain, space-separated form).
// - text: JSON-quoted text for steps written as a quoted string.
// - css: raw matcher source for steps that do not start with a quote.
// - index: the number written after '#', when present.
type Token = {
  combinator: '' | '>' | '~' | '^',
  index?: number,
  text?: string,
  css?: string,
};

/**
 * Splits a selector string into tokens.
 *
 * Grammar, as implemented below:
 *  - steps are separated by spaces and may be prefixed with a `>` or `~` combinator;
 *  - a bare `^` is a step of its own and carries no matcher;
 *  - a matcher that starts with a quote (`, " or ') is a text matcher and ends at the
 *    matching closing quote; anything else is a css matcher that runs until an
 *    unquoted space, `>`, `~`, `^` or `#`;
 *  - a matcher may be followed by `#<digits>`, stored as `index`.
 *
 * @param selector Selector source text.
 * @returns The token list, or a number: the position in `selector` at which parsing failed.
 */
function tokenize(selector: string): Token[] | number {
  const tokens: Token[] = [];
  let pos = 0;

  const skipWhitespace = () => {
    while (pos < selector.length && selector[pos] === ' ')
      pos++;
  };

  while (pos < selector.length) {
    skipWhitespace();
    if (pos === selector.length)
      break;

    // A selector cannot start with a combinator.
    if (!tokens.length && '^>~'.includes(selector[pos]))
      return pos;

    const token: Token = { combinator: '' };
    const first = selector[pos];
    if (first === '^') {
      token.combinator = '^';
      tokens.push(token);
      pos++;
      continue;
    }

    if (first === '>' || first === '~') {
      token.combinator = first;
      pos++;
      skipWhitespace();
      // A combinator must be followed by a matcher.
      if (pos === selector.length)
        return pos;
    }

    let text = '';
    let end = pos;
    let stringQuote: string | undefined;
    const isText = '`"\''.includes(selector[pos]);
    while (end < selector.length) {
      if (stringQuote) {
        if (selector[end] === '\\' && end + 1 < selector.length) {
          // Text matchers are unescaped here; css matchers keep the backslash verbatim.
          if (!isText)
            text += selector[end];
          text += selector[end + 1];
          end += 2;
        } else if (selector[end] === stringQuote) {
          text += selector[end++];
          stringQuote = undefined;
          // The closing quote terminates a text matcher.
          if (isText)
            break;
        } else {
          text += selector[end++];
        }
      } else if (' >~^#'.includes(selector[end])) {
        break;
      } else if ('`"\''.includes(selector[end])) {
        stringQuote = selector[end];
        text += selector[end++];
      } else {
        text += selector[end++];
      }
    }

    // Unterminated quoted string.
    if (stringQuote)
      return end;

    // Nothing was consumed (e.g. "a > > b" or "a #1"): report a parse error here
    // rather than producing an empty css matcher that fails later when it is used.
    if (end === pos)
      return pos;

    if (isText)
      token.text = JSON.stringify(text.substring(1, text.length - 1));
    else
      token.css = text;
    pos = end;

    if (pos < selector.length && selector[pos] === '#') {
      pos++;
      let indexEnd = pos;
      while (indexEnd < selector.length && selector[indexEnd] >= '0' && selector[indexEnd] <= '9')
        indexEnd++;
      // '#' must be followed by at least one digit.
      if (indexEnd === pos)
        return pos;
      token.index = Number(selector.substring(pos, indexEnd));
      pos = indexEnd;
    }

    tokens.push(token);
  }
  return tokens;
}

/**
 * Builds the chain of nodes leading from `root` down to `targetElement`, both inclusive.
 *
 * @param root Node the path starts at.
 * @param targetElement Element the path ends at.
 * @returns Nodes ordered from `root` (first) to `targetElement` (last).
 * @throws Error when, while walking up from the target, a node has no parent or its
 *   parent is neither an element (nodeType 1) nor a document fragment (nodeType 11)
 *   before `root` is reached.
 */
function pathFromRoot(root: SelectorRoot, targetElement: Element): (Element | SelectorRoot)[] {
  let target: Element | SelectorRoot = targetElement;
  const path: (Element | SelectorRoot)[] = [target];
  while (target !== root) {
    const parent = target.parentNode;
    const isElementOrFragment = !!parent &&
        (parent.nodeType === 1 /* Node.ELEMENT_NODE */ || parent.nodeType === 11 /* Node.DOCUMENT_FRAGMENT_NODE */);
    if (!isElementOrFragment)
      throw new Error('Target does not belong to the root subtree');
    target = parent as (Element | SelectorRoot);
    path.push(target);
  }
  // Collected bottom-up; callers expect root first.
  path.reverse();
  return path;
}

// This is a map from a list element (parent) to a number of contained lists (immediate children).
//
// Example:
//
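// <div>
//   <span><img/><img/></span>
//   <span></span>
//   <span></span>
//   <div></div>
//   <div></div>
//   <div></div>
// </div>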
// // Here we might have the following: // div -> [[span, span, span], [div, div, div]] // span -> [[img, img]] type ListsMap = Map; function detectLists(root: SelectorRoot, shouldConsider: (e: Element | SelectorRoot) => boolean, getBox: (e: Element) => ClientRect): ListsMap { const lists: ListsMap = new Map(); const add = (map: Map, element: Element, key: string): void => { let list = map.get(key); if (!list) { list = []; map.set(key, list); } list.push(element); }; const mark = (parent: Element | SelectorRoot, map: Map, used: Set): void => { for (let list of map.values()) { list = list.filter(item => !used.has(item)); if (list.length < 2) continue; let collection = lists.get(parent); if (!collection) { collection = []; lists.set(parent, collection); } collection.push(list); list.forEach(item => used.add(item)); } }; // hashes list: s, vh, v, h const kHashes = 4; const visit = (element: Element | SelectorRoot, produceHashes: boolean): { size: number, hashes?: string[] } => { const consider = shouldConsider(element); let size = 1; let maps: Map[] | undefined; if (consider) maps = new Array(kHashes).fill(0).map(_ => new Map()); let structure: string[] | undefined; if (produceHashes) structure = [element.nodeName]; for (let child = element.firstElementChild; child; child = child.nextElementSibling) { const childResult = visit(child, consider); size += childResult.size; if (consider) { for (let i = 0; i < childResult.hashes!.length; i++) { if (childResult.hashes![i]) add(maps![i], child, childResult.hashes![i]); } } if (structure) structure.push(child.nodeName); } if (consider) { const used = new Set(); maps!.forEach(map => mark(element, map, used)); } let hashes: string[] | undefined; if (produceHashes) { const box = getBox(element as Element); hashes = []; hashes.push((structure!.length >= 4) || (size >= 10) ? 
structure!.join('') : ''); hashes.push(`${element.nodeName},${(size / 3) | 0},${box.height | 0},${box.width | 0}`); if (size <= 5) hashes.push(`${element.nodeName},${(size / 3) | 0},${box.width | 0},${box.left | 0}`); else hashes.push(`${element.nodeName},${(size / 3) | 0},${box.width | 0},${box.left | 0},${2 * Math.log(box.height) | 0}`); if (size <= 5) hashes.push(`${element.nodeName},${(size / 3) | 0},${box.height | 0},${box.top | 0}`); else hashes.push(`${element.nodeName},${(size / 3) | 0},${box.height | 0},${box.top | 0},${2 * Math.log(box.width) | 0}`); } return { size, hashes }; }; visit(root, false); return lists; } type Step = { token: Token; // Element we point at. element: Element | SelectorRoot; // Distance between element and (lca between target and element). depth: number; // One step score. score: number; // Total path score. totalScore: number; previous?: Step; // Repeat number for ^ steps.s repeat?: number; }; type Options = { genericTagScore: number, textScore?: number, imgAltScore?: number, ariaLabelScore?: number, detectLists?: boolean, avoidShortText?: boolean, usePlaceholders?: boolean, debug?: boolean }; const defaultOptions: Options = { genericTagScore: 10, textScore: 1, imgAltScore: 2, ariaLabelScore: 2, detectLists: true, avoidShortText: false, usePlaceholders: true, debug: false, }; type CueType = 'text' | 'tag' | 'imgAlt' | 'ariaLabel'; type Cue = { type: CueType, score: number, elements: Element[], }; type CueMap = Map; type ElementMetrics = { box: ClientRect, style: CSSStyleDeclaration, fontMetric: number, }; type Lca = { lcaDepth: number; lca: Element | SelectorRoot; anchor: Element | SelectorRoot | undefined; depth: number; // Distance to lca. 
}; type PathCue = { type: CueType, score: number, elements: Element[][], anchorCount: Map, }; type PreprocessResult = { pathCues: Map, lcaMap: Map, }; type ListIndex = Map; function parentOrRoot(element: Element | SelectorRoot): Element | SelectorRoot | null { return element.parentNode as Element | SelectorRoot; } class Engine { private _cues = new Map(); private _metrics = new Map(); readonly options: Options; constructor(options: Options = defaultOptions) { this.options = options; } query(root: SelectorRoot, selector: string, all: boolean): Element[] { const tokens = tokenize(selector); if (typeof tokens === 'number') throw new Error('Cannot parse selector at position ' + tokens); if (!tokens.length) throw new Error('Empty selector'); if (!this._cues.has(root)) { const cueMap: CueMap = new Map(); const pathCues = this._preprocess(root, [root], Infinity).pathCues; for (const [text, cue] of pathCues) { cueMap.set(text, { type: cue.type, score: cue.score, elements: cue.elements[0] }); } this._cues.set(root, cueMap); } // Map from the element to the boundary used. We never go outside the boundary when doing '~'. let currentStep = new Map(); currentStep.set(root, root); for (const token of tokens) { const nextStep = new Map(); for (let [element, boundary] of currentStep) { let next: (Element | SelectorRoot)[] = []; if (token.combinator === '^') { if (element === boundary) { next = []; } else { const parent = parentOrRoot(element); next = parent ? [parent] : []; } } else if (token.combinator === '>') { boundary = element; next = this._matchChildren(element, token, all); } else if (token.combinator === '') { boundary = element; next = this._matchSubtree(element, token, all); } else if (token.combinator === '~') { while (true) { next = this._matchSubtree(element, token, all); if (next.length) { // Further '~' / '^' will not go outside of this boundary, which is // a container with both the cue and the target elements inside. 
boundary = element; break; } if (element === boundary) break; element = parentOrRoot(element)!; } } for (const nextElement of next) { if (!nextStep.has(nextElement)) nextStep.set(nextElement, boundary); } } currentStep = nextStep; } return Array.from(currentStep.keys()).filter(e => e.nodeType === 1 /* Node.ELEMENT_NODE */) as Element[]; } create(root: SelectorRoot, target: Element, type: SelectorType): string { const path = pathFromRoot(root, target); const maxCueCount = type === 'notext' ? 50 : 10; const { pathCues, lcaMap } = this._preprocess(root, path, maxCueCount); const lists: ListIndex | undefined = this.options.detectLists ? this._buildLists(root, path) : undefined; const queue: Map[] = path.map(_ => new Map()); const startStep: Step = { token: { combinator: '' }, element: root, depth: 0, score: 0, totalScore: 0 }; for (let stepDepth = -1; stepDepth < path.length; stepDepth++) { const stepsMap = stepDepth === -1 ? new Map([[undefined, startStep]]) : queue[stepDepth]; const ancestorDepth = stepDepth === -1 ? 0 : stepDepth; for (const [text, cue] of pathCues) { const elements = cue.elements[ancestorDepth]; for (let index = 0; index < elements.length; index++) { const element = elements[index]; const lca = lcaMap.get(element)!; const lcaDepth = lca.lcaDepth; // Always go deeper in the tree. if (lcaDepth <= stepDepth) continue; // 'notext' - do not use elements from the target's subtree. if (type === 'notext' && lcaDepth === path.length - 1 && lca.depth > 0) continue; // 'notext' - do not use target's own text. if (type === 'notext' && lcaDepth === path.length - 1 && !lca.depth && cue.type !== 'tag') continue; const targetAnchor = path[lcaDepth + 1]; if (lists && lca.anchor && targetAnchor && lca.anchor !== targetAnchor) { const oldList = lists.get(lca.anchor); // Do not use cues from sibling list items (lca.anchor and targetAnchor). 
if (oldList && oldList === lists.get(targetAnchor)) continue; } if (cue.type !== 'tag' && !this._isVisible(element)) continue; const distanceToTarget = path.length - stepDepth; // Short text can be used more effectively in a smaller scope. let shortTextScore = 0; if (this.options.avoidShortText && cue.type === 'text') shortTextScore = Math.max(0, distanceToTarget - 2 * (text.length - 2)); const score = (cue.score + shortTextScore) * ( // Unique cues are heavily favored. 1 * (index + elements.length * 1000) + // Larger text is preferred. 5 * (cue.type === 'text' ? this._elementMetrics(element).fontMetric : 1) + // The closer to the target, the better. 1 * lca.depth ); for (const [anchor, step] of stepsMap) { // This ensures uniqueness when resolving the selector. if (anchor && (cue.anchorCount.get(anchor) || 0) > index) continue; let newStep: Step = { token: { combinator: stepDepth === -1 ? '' : '~', text: cue.type === 'text' ? text : undefined, css: cue.type === 'text' ? undefined : text, index: index || undefined, }, previous: step, depth: lca.depth, element, score, totalScore: step.totalScore + score }; let nextStep = queue[lcaDepth].get(lca.anchor); if (!nextStep || nextStep.totalScore > newStep.totalScore) queue[lcaDepth].set(lca.anchor, newStep); // Try going to the ancestor. 
if (newStep.depth) { newStep = { token: { combinator: '^' }, previous: newStep, depth: 0, element: lca.lca, score: 2000 * newStep.depth, totalScore: newStep.totalScore + 2000 * newStep.depth, repeat: newStep.depth }; nextStep = queue[lcaDepth].get(undefined); if (!nextStep || nextStep.totalScore > newStep.totalScore) queue[lcaDepth].set(undefined, newStep); } } } } } let best: Step | undefined; for (const [, step] of queue[path.length - 1]) { if (!best || step.totalScore < best.totalScore) best = step; } if (!best) return ''; const tokens: Token[] = new Array(best.depth).fill({ combinator: '^' }); while (best && best !== startStep) { for (let repeat = best.repeat || 1; repeat; repeat--) tokens.push(best.token); best = best.previous; } tokens.reverse(); return this._serialize(tokens); } private _textMetric(text: string): number { // Text which looks like a float number or counter is most likely volatile. if (/^\$?[\d,]+(\.\d+|(\.\d+)?[kKmMbBgG])?$/.test(text)) return 12; const num = Number(text); // Large numbers are likely volatile. if (!isNaN(num) && (num >= 32 || num < 0)) return 6; return 1; } private _elementMetrics(element: Element): ElementMetrics { let metrics = this._metrics.get(element); if (!metrics) { const style = element.ownerDocument ? 
element.ownerDocument.defaultView!.getComputedStyle(element) : ({} as CSSStyleDeclaration); const box = element.getBoundingClientRect(); const fontSize = (parseInt(style.fontSize || '', 10) || 12) / 12; // default 12 px const fontWeight = (parseInt(style.fontWeight || '', 10) || 400) / 400; // default normal weight let fontMetric = fontSize * (1 + (fontWeight - 1) / 5); fontMetric = 1 / Math.exp(fontMetric - 1); metrics = { box, style, fontMetric }; this._metrics.set(element, metrics); } return metrics; } private _isVisible(element: Element): boolean { const metrics = this._elementMetrics(element); return metrics.box.width > 1 && metrics.box.height > 1; } private _preprocess(root: SelectorRoot, path: (Element | SelectorRoot)[], maxCueCount: number): PreprocessResult { const pathCues = new Map(); const lcaMap = new Map(); const textScore = this.options.textScore || 1; const appendCue = (text: string, type: CueType, score: number, element: Element, lca: Lca, textValue: string) => { let pathCue = pathCues.get(text); if (!pathCue) { pathCue = { type, score: (textValue ? 
this._textMetric(textValue) : 1) * score, elements: [], anchorCount: new Map() }; for (let i = 0; i < path.length; i++) pathCue.elements.push([]); pathCues.set(text, pathCue); } for (let index = lca.lcaDepth; index >= 0; index--) { const elements = pathCue.elements[index]; if (elements.length < maxCueCount) elements.push(element); } if (lca.anchor) pathCue.anchorCount.set(lca.anchor, 1 + (pathCue.anchorCount.get(lca.anchor) || 0)); }; const appendElementCues = (element: Element, lca: Lca, detached: boolean) => { const nodeName = element.nodeName; if (!detached && this.options.usePlaceholders && nodeName === 'INPUT') { const placeholder = element.getAttribute('placeholder'); if (placeholder) appendCue(JSON.stringify(placeholder), 'text', textScore, element, lca, placeholder); } if (!detached && nodeName === 'INPUT' && element.getAttribute('type') === 'button') { const value = element.getAttribute('value'); if (value) appendCue(JSON.stringify(value), 'text', textScore, element, lca, value); } if (!nodeName.startsWith(' { // Check for elements STYLE, NOSCRIPT, SCRIPT, OPTION and other elements // that have |display:none| behavior. 
const detached = !(element as HTMLElement).offsetParent; if (element.nodeType === 1 /* Node.ELEMENT_NODE */) appendElementCues(element as Element, lca, detached); lcaMap.set(element, lca); for (let childNode = element.firstChild; childNode; childNode = childNode.nextSibling) { if (element.nodeType === 1 /* Node.ELEMENT_NODE */ && !detached && childNode.nodeType === 3 /* Node.TEXT_NODE */ && childNode.nodeValue) { const textValue = childNode.nodeValue.trim(); if (textValue) appendCue(JSON.stringify(textValue), 'text', textScore, element as Element, lca, textValue); } if (childNode.nodeType !== 1 /* Node.ELEMENT_NODE */) continue; const childElement = childNode as Element; if (childElement.nodeName.startsWith(' root.contains(element)); if (!filtered.length) continue; const newCue: Cue = { type: cue.type, score: cue.score, elements: filtered }; result.set(text, newCue); } return result; } private _buildLists(root: Element | SelectorRoot, path: (Element | SelectorRoot)[]): ListIndex { const pathSet = new Set(path); const map = detectLists(root, e => pathSet.has(e), e => this._elementMetrics(e).box); const result: ListIndex = new Map(); let listNumber = 1; for (const collection of map.values()) { for (const list of collection) { for (const child of list) result.set(child, listNumber); ++listNumber; } } return result; } private _matchChildren(parent: Element | SelectorRoot, token: Token, all: boolean): Element[] { const result: Element[] = []; if (token.index !== undefined) all = false; let index = token.index || 0; if (token.css !== undefined) { for (let child = parent.firstElementChild; child; child = child.nextElementSibling) { if (child.matches(token.css) && (all || !index--)) { result.push(child); if (!all) return result; } } return result; } if (token.text !== undefined) { const cue = this._getCues(parent).get(token.text); if (!cue || cue.type !== 'text') return []; for (const element of cue.elements) { if (parentOrRoot(element) === parent && (all || !index--)) { 
result.push(element); if (!all) return result; } } return result; } throw new Error('Unsupported token'); } private _matchSubtree(root: Element | SelectorRoot, token: Token, all: boolean): Element[] { const result: Element[] = []; if (token.index !== undefined) all = false; let index = token.index || 0; if (token.css !== undefined) { if (root.nodeType === 1 /* Node.ELEMENT_NODE */) { const rootElement = root as Element; if (rootElement.matches(token.css) && (all || !index--)) { result.push(rootElement); if (!all) return result; } } const queried = root.querySelectorAll(token.css); if (all) result.push(...Array.from(queried)); else if (queried.length > index) result.push(queried.item(index)); return result; } if (token.text !== undefined) { const texts = this._getCues(root); const cue = texts.get(token.text); if (!cue || cue.type !== 'text') return result; if (all) return cue.elements; if (index < cue.elements.length) result.push(cue.elements[index]); return result; } throw new Error('Unsupported token'); } private _getCues(element: Element | SelectorRoot): CueMap { if (!this._cues.has(element)) { let parent = element; while (!this._cues.has(parent)) parent = parentOrRoot(parent)!; this._cues.set(element, this._filterCues(this._cues.get(parent)!, element)); } return this._cues.get(element)!; } private _serialize(tokens: Token[]): string { const result = tokens.map(token => (token.combinator === '' ? ' ' : token.combinator) + (token.text !== undefined ? token.text : '') + (token.css !== undefined ? token.css : '') + (token.index !== undefined ? 
'#' + token.index : '')).join(''); if (result[0] !== ' ') throw new Error('First token is wrong'); return result.substring(1); } } const ZSSelectorEngine: SelectorEngine = { name: 'zs', create(root: SelectorRoot, element: Element, type?: SelectorType): string { return new Engine().create(root, element, type || 'default'); }, query(root: SelectorRoot, selector: string): Element | undefined { return new Engine().query(root, selector, false /* all */)[0]; }, queryAll(root: SelectorRoot, selector: string): Element[] { return new Engine().query(root, selector, true /* all */); } }; (ZSSelectorEngine as any).test = () => { const elements = Array.from(document.querySelectorAll('*')).slice(1500, 2000); console.time('test'); // eslint-disable-line no-console const failures = elements.filter((e, index) => { const name = e.tagName.toUpperCase(); if (name === 'SCRIPT' || name === 'STYLE' || name === 'NOSCRIPT' || name === 'META' || name === 'LINK' || name === 'OPTION') return false; if (index % 100 === 0) console.log(`${index} / ${elements.length}`); // eslint-disable-line no-console if (e.nodeName.toLowerCase().startsWith('