diff --git a/src/common/string/filter/filter.ts b/src/common/string/filter/filter.ts index d3471433ab..ff42c20393 100644 --- a/src/common/string/filter/filter.ts +++ b/src/common/string/filter/filter.ts @@ -34,14 +34,12 @@ const _maxLen = 128; function initTable() { const table: number[][] = []; - const row: number[] = [0]; - for (let i = 1; i <= _maxLen; i++) { - row.push(-i); + const row: number[] = []; + for (let i = 0; i <= _maxLen; i++) { + row[i] = 0; } for (let i = 0; i <= _maxLen; i++) { - const thisRow = row.slice(0); - thisRow[0] = -i; - table.push(thisRow); + table.push(row.slice(0)); } return table; } @@ -50,7 +48,7 @@ function isSeparatorAtPos(value: string, index: number): boolean { if (index < 0 || index >= value.length) { return false; } - const code = value.charCodeAt(index); + const code = value.codePointAt(index); switch (code) { case CharCode.Underline: case CharCode.Dash: @@ -62,8 +60,16 @@ function isSeparatorAtPos(value: string, index: number): boolean { case CharCode.DoubleQuote: case CharCode.Colon: case CharCode.DollarSign: + case CharCode.LessThan: + case CharCode.OpenParen: + case CharCode.OpenSquareBracket: return true; + case undefined: + return false; default: + if (isEmojiImprecise(code)) { + return true; + } return false; } } @@ -92,10 +98,15 @@ function isPatternInWord( patternLen: number, wordLow: string, wordPos: number, - wordLen: number + wordLen: number, + fillMinWordPosArr = false ): boolean { while (patternPos < patternLen && wordPos < wordLen) { if (patternLow[patternPos] === wordLow[wordPos]) { + if (fillMinWordPosArr) { + // Remember the min word position for each pattern position + _minWordMatchPos[patternPos] = wordPos; + } patternPos += 1; } wordPos += 1; @@ -104,42 +115,22 @@ function isPatternInWord( } enum Arrow { - Top = 0b1, - Diag = 0b10, - Left = 0b100, + Diag = 1, + Left = 2, + LeftLeft = 3, } /** - * A tuple of three values. + * An array representating a fuzzy match. + * * 0. the score - * 1. the matches encoded as bitmask (2^53) - * 2. the offset at which matching started + * 1. the offset at which matching started + * 2. `` + * 3. `` + * 4. `` etc */ -export type FuzzyScore = [number, number, number]; - -interface FilterGlobals { - _matchesCount: number; - _topMatch2: number; - _topScore: number; - _wordStart: number; - _firstMatchCanBeWeak: boolean; - _table: number[][]; - _scores: number[][]; - _arrows: Arrow[][]; -} - -function initGlobals(): FilterGlobals { - return { - _matchesCount: 0, - _topMatch2: 0, - _topScore: 0, - _wordStart: 0, - _firstMatchCanBeWeak: false, - _table: initTable(), - _scores: initTable(), - _arrows: initTable(), - }; -} +// export type FuzzyScore = [score: number, wordStart: number, ...matches: number[]];// [number, number, number]; +export type FuzzyScore = Array; export function fuzzyScore( pattern: string, @@ -150,7 +141,6 @@ export function fuzzyScore( wordStart: number, firstMatchCanBeWeak: boolean ): FuzzyScore | undefined { - const globals = initGlobals(); const patternLen = pattern.length > _maxLen ? _maxLen : pattern.length; const wordLen = word.length > _maxLen ? _maxLen : word.length; @@ -172,18 +162,30 @@ export function fuzzyScore( patternLen, wordLow, wordStart, - wordLen + wordLen, + true ) ) { return undefined; } + // Find the max matching word position for each pattern position + // NOTE: the min matching word position was filled in above, in the `isPatternInWord` call + _fillInMaxWordMatchPos( + patternLen, + wordLen, + patternStart, + wordStart, + patternLow, + wordLow + ); + let row = 1; let column = 1; let patternPos = patternStart; let wordPos = wordStart; - let hasStrongFirstMatch = false; + const hasStrongFirstMatch = [false]; // There will be a match, fill in tables for ( @@ -191,83 +193,146 @@ export function fuzzyScore( patternPos < patternLen; row++, patternPos++ ) { + // Reduce search space to possible matching word positions and to possible access from next row + const minWordMatchPos = _minWordMatchPos[patternPos]; + const maxWordMatchPos = _maxWordMatchPos[patternPos]; + const nextMaxWordMatchPos = + patternPos + 1 < patternLen ? _maxWordMatchPos[patternPos + 1] : wordLen; + for ( - column = 1, wordPos = wordStart; - wordPos < wordLen; + column = minWordMatchPos - wordStart + 1, wordPos = minWordMatchPos; + wordPos < nextMaxWordMatchPos; column++, wordPos++ ) { - const score = _doScore( - pattern, - patternLow, - patternPos, - patternStart, - word, - wordLow, - wordPos - ); + let score = Number.MIN_SAFE_INTEGER; + let canComeDiag = false; - if (patternPos === patternStart && score > 1) { - hasStrongFirstMatch = true; + if (wordPos <= maxWordMatchPos) { + score = _doScore( + pattern, + patternLow, + patternPos, + patternStart, + word, + wordLow, + wordPos, + wordLen, + wordStart, + _diag[row - 1][column - 1] === 0, + hasStrongFirstMatch + ); } - globals._scores[row][column] = score; + let diagScore = 0; + if (score !== Number.MAX_SAFE_INTEGER) { + canComeDiag = true; + diagScore = score + _table[row - 1][column - 1]; + } - const diag = - globals._table[row - 1][column - 1] + (score > 1 ? 1 : score); - const top = globals._table[row - 1][column] + -1; - const left = globals._table[row][column - 1] + -1; + const canComeLeft = wordPos > minWordMatchPos; + const leftScore = canComeLeft + ? _table[row][column - 1] + (_diag[row][column - 1] > 0 ? -5 : 0) + : 0; // penalty for a gap start - if (left >= top) { - // left or diag - if (left > diag) { - globals._table[row][column] = left; - globals._arrows[row][column] = Arrow.Left; - } else if (left === diag) { - globals._table[row][column] = left; - globals._arrows[row][column] = Arrow.Left || Arrow.Diag; - } else { - globals._table[row][column] = diag; - globals._arrows[row][column] = Arrow.Diag; - } - } else if (top > diag) { - globals._table[row][column] = top; - globals._arrows[row][column] = Arrow.Top; - } else if (top === diag) { - globals._table[row][column] = top; - globals._arrows[row][column] = Arrow.Top || Arrow.Diag; + const canComeLeftLeft = + wordPos > minWordMatchPos + 1 && _diag[row][column - 1] > 0; + const leftLeftScore = canComeLeftLeft + ? _table[row][column - 2] + (_diag[row][column - 2] > 0 ? -5 : 0) + : 0; // penalty for a gap start + + if ( + canComeLeftLeft && + (!canComeLeft || leftLeftScore >= leftScore) && + (!canComeDiag || leftLeftScore >= diagScore) + ) { + // always prefer choosing left left to jump over a diagonal because that means a match is earlier in the word + _table[row][column] = leftLeftScore; + _arrows[row][column] = Arrow.LeftLeft; + _diag[row][column] = 0; + } else if (canComeLeft && (!canComeDiag || leftScore >= diagScore)) { + // always prefer choosing left since that means a match is earlier in the word + _table[row][column] = leftScore; + _arrows[row][column] = Arrow.Left; + _diag[row][column] = 0; + } else if (canComeDiag) { + _table[row][column] = diagScore; + _arrows[row][column] = Arrow.Diag; + _diag[row][column] = _diag[row - 1][column - 1] + 1; } else { - globals._table[row][column] = diag; - globals._arrows[row][column] = Arrow.Diag; + throw new Error(`not possible`); } } } if (_debug) { - printTables(pattern, patternStart, word, wordStart, globals); + printTables(pattern, patternStart, word, wordStart); } - if (!hasStrongFirstMatch && !firstMatchCanBeWeak) { + if (!hasStrongFirstMatch[0] && !firstMatchCanBeWeak) { return undefined; } - globals._matchesCount = 0; - globals._topScore = -100; - globals._wordStart = wordStart; - globals._firstMatchCanBeWeak = firstMatchCanBeWeak; + row--; + column--; - _findAllMatches2( - row - 1, - column - 1, - patternLen === wordLen ? 1 : 0, - 0, - false, - globals - ); - if (globals._matchesCount === 0) { - return undefined; + const result: FuzzyScore = [_table[row][column], wordStart]; + + let backwardsDiagLength = 0; + let maxMatchColumn = 0; + + while (row >= 1) { + // Find the column where we go diagonally up + let diagColumn = column; + do { + const arrow = _arrows[row][diagColumn]; + if (arrow === Arrow.LeftLeft) { + diagColumn -= 2; + } else if (arrow === Arrow.Left) { + diagColumn -= 1; + } else { + // found the diagonal + break; + } + } while (diagColumn >= 1); + + // Overturn the "forwards" decision if keeping the "backwards" diagonal would give a better match + if ( + backwardsDiagLength > 1 && // only if we would have a contiguous match of 3 characters + patternLow[patternStart + row - 1] === wordLow[wordStart + column - 1] && // only if we can do a contiguous match diagonally + !isUpperCaseAtPos(diagColumn + wordStart - 1, word, wordLow) && // only if the forwards chose diagonal is not an uppercase + backwardsDiagLength + 1 > _diag[row][diagColumn] // only if our contiguous match would be longer than the "forwards" contiguous match + ) { + diagColumn = column; + } + + if (diagColumn === column) { + // this is a contiguous match + backwardsDiagLength++; + } else { + backwardsDiagLength = 1; + } + + if (!maxMatchColumn) { + // remember the last matched column + maxMatchColumn = diagColumn; + } + + row--; + column = diagColumn - 1; + result.push(column); } - return [globals._topScore, globals._topMatch2, wordStart]; + if (wordLen === patternLen) { + // the word matches the pattern with all characters! + // giving the score a total match boost (to come up ahead other words) + result[0] += 2; + } + + // Add 1 penalty for each skipped character in the word + const skippedCharsCount = maxMatchColumn - patternLen; + result[0] -= skippedCharsCount; + + return result; } function _doScore( @@ -277,50 +342,81 @@ function _doScore( patternStart: number, word: string, wordLow: string, - wordPos: number -) { + wordPos: number, + wordLen: number, + wordStart: number, + newMatchStart: boolean, + outFirstMatchStrong: boolean[] +): number { if (patternLow[patternPos] !== wordLow[wordPos]) { - return -1; + return Number.MIN_SAFE_INTEGER; } + + let score = 1; + let isGapLocation = false; if (wordPos === patternPos - patternStart) { // common prefix: `foobar <-> foobaz` // ^^^^^ - if (pattern[patternPos] === word[wordPos]) { - return 7; - } - return 5; - } - - if ( + score = pattern[patternPos] === word[wordPos] ? 7 : 5; + } else if ( isUpperCaseAtPos(wordPos, word, wordLow) && (wordPos === 0 || !isUpperCaseAtPos(wordPos - 1, word, wordLow)) ) { // hitting upper-case: `foo <-> forOthers` // ^^ ^ - if (pattern[patternPos] === word[wordPos]) { - return 7; - } - return 5; - } - - if ( + score = pattern[patternPos] === word[wordPos] ? 7 : 5; + isGapLocation = true; + } else if ( isSeparatorAtPos(wordLow, wordPos) && (wordPos === 0 || !isSeparatorAtPos(wordLow, wordPos - 1)) ) { // hitting a separator: `. <-> foo.bar` // ^ - return 5; - } - - if ( + score = 5; + } else if ( isSeparatorAtPos(wordLow, wordPos - 1) || isWhitespaceAtPos(wordLow, wordPos - 1) ) { // post separator: `foo <-> bar_foo` // ^^^ - return 5; + score = 5; + isGapLocation = true; } - return 1; + + if (score > 1 && patternPos === patternStart) { + outFirstMatchStrong[0] = true; + } + + if (!isGapLocation) { + isGapLocation = + isUpperCaseAtPos(wordPos, word, wordLow) || + isSeparatorAtPos(wordLow, wordPos - 1) || + isWhitespaceAtPos(wordLow, wordPos - 1); + } + + // + if (patternPos === patternStart) { + // first character in pattern + if (wordPos > wordStart) { + // the first pattern character would match a word character that is not at the word start + // so introduce a penalty to account for the gap preceding this match + score -= isGapLocation ? 3 : 5; + } + } else if (newMatchStart) { + // this would be the beginning of a new match (i.e. there would be a gap before this location) + score += isGapLocation ? 2 : 0; + } else { + // this is part of a contiguous match, so give it a slight bonus, but do so only if it would not be a prefered gap location + score += isGapLocation ? 0 : 1; + } + + if (wordPos + 1 === wordLen) { + // we always penalize gaps, but this gives unfair advantages to a match that would match the last character in the word + // so pretend there is a gap after the last character in the word to normalize things + score -= isGapLocation ? 3 : 5; + } + + return score; } function printTable( @@ -360,104 +456,96 @@ function printTables( pattern: string, patternStart: number, word: string, - wordStart: number, - globals: FilterGlobals + wordStart: number ): void { pattern = pattern.substr(patternStart); word = word.substr(wordStart); - console.log( - printTable(globals._table, pattern, pattern.length, word, word.length) - ); - console.log( - printTable(globals._arrows, pattern, pattern.length, word, word.length) - ); - console.log( - printTable(globals._scores, pattern, pattern.length, word, word.length) - ); + console.log(printTable(_table, pattern, pattern.length, word, word.length)); + console.log(printTable(_arrows, pattern, pattern.length, word, word.length)); + console.log(printTable(_diag, pattern, pattern.length, word, word.length)); } -function _findAllMatches2( - row: number, - column: number, - total: number, - matches: number, - lastMatched: boolean, - globals: FilterGlobals -): void { - if (globals._matchesCount >= 10 || total < -25) { - // stop when having already 10 results, or - // when a potential alignment as already 5 gaps - return; +const _minWordMatchPos = initArr(2 * _maxLen); // min word position for a certain pattern position +const _maxWordMatchPos = initArr(2 * _maxLen); // max word position for a certain pattern position +const _diag = initTable(); // the length of a contiguous diagonal match +const _table = initTable(); +const _arrows = initTable(); + +function initArr(maxLen: number) { + const row: number[] = []; + for (let i = 0; i <= maxLen; i++) { + row[i] = 0; } + return row; +} - let simpleMatchCount = 0; +function _fillInMaxWordMatchPos( + patternLen: number, + wordLen: number, + patternStart: number, + wordStart: number, + patternLow: string, + wordLow: string +) { + let patternPos = patternLen - 1; + let wordPos = wordLen - 1; + while (patternPos >= patternStart && wordPos >= wordStart) { + if (patternLow[patternPos] === wordLow[wordPos]) { + _maxWordMatchPos[patternPos] = wordPos; + patternPos--; + } + wordPos--; + } +} - while (row > 0 && column > 0) { - const score = globals._scores[row][column]; - const arrow = globals._arrows[row][column]; +export interface FuzzyScorer { + ( + pattern: string, + lowPattern: string, + patternPos: number, + word: string, + lowWord: string, + wordPos: number, + firstMatchCanBeWeak: boolean + ): FuzzyScore | undefined; +} - if (arrow === Arrow.Left) { - // left -> no match, skip a word character - column -= 1; - if (lastMatched) { - total -= 5; // new gap penalty - } else if (matches !== 0) { - total -= 1; // gap penalty after first match - } - lastMatched = false; - simpleMatchCount = 0; - } else if (arrow && Arrow.Diag) { - if (arrow && Arrow.Left) { - // left - _findAllMatches2( - row, - column - 1, - matches !== 0 ? total - 1 : total, // gap penalty after first match - matches, - lastMatched, - globals - ); - } - - // diag - total += score; - row -= 1; - column -= 1; - lastMatched = true; - - // match -> set a 1 at the word pos - matches += 2 ** (column + globals._wordStart); - - // count simple matches and boost a row of - // simple matches when they yield in a - // strong match. - if (score === 1) { - simpleMatchCount += 1; - - if (row === 0 && !globals._firstMatchCanBeWeak) { - // when the first match is a weak - // match we discard it - return; - } - } else { - // boost - total += 1 + simpleMatchCount * (score - 1); - simpleMatchCount = 0; - } +export function createMatches(score: undefined | FuzzyScore): Match[] { + if (typeof score === "undefined") { + return []; + } + const res: Match[] = []; + const wordPos = score[1]; + for (let i = score.length - 1; i > 1; i--) { + const pos = score[i] + wordPos; + const last = res[res.length - 1]; + if (last && last.end === pos) { + last.end = pos + 1; } else { - return; + res.push({ start: pos, end: pos + 1 }); } } - - total -= column >= 3 ? 9 : column * 3; // late start penalty - - // dynamically keep track of the current top score - // and insert the current best score at head, the rest at tail - globals._matchesCount += 1; - if (total > globals._topScore) { - globals._topScore = total; - globals._topMatch2 = matches; - } + return res; } -// #endregion +/** + * A fast function (therefore imprecise) to check if code points are emojis. + * Generated using https://github.com/alexdima/unicode-utils/blob/master/generate-emoji-test.js + */ +export function isEmojiImprecise(x: number): boolean { + return ( + (x >= 0x1f1e6 && x <= 0x1f1ff) || + x === 8986 || + x === 8987 || + x === 9200 || + x === 9203 || + (x >= 9728 && x <= 10175) || + x === 11088 || + x === 11093 || + (x >= 127744 && x <= 128591) || + (x >= 128640 && x <= 128764) || + (x >= 128992 && x <= 129003) || + (x >= 129280 && x <= 129535) || + (x >= 129648 && x <= 129750) + ); +} diff --git a/src/common/string/filter/sequence-matching.ts b/src/common/string/filter/sequence-matching.ts index 6e4a5bf2e5..aa67093497 100644 --- a/src/common/string/filter/sequence-matching.ts +++ b/src/common/string/filter/sequence-matching.ts @@ -11,7 +11,7 @@ import { fuzzyScore } from "./filter"; */ export const fuzzySequentialMatch = (filter: string, ...words: string[]) => { - let topScore = 0; + let topScore = Number.NEGATIVE_INFINITY; for (const word of words) { const scores = fuzzyScore( @@ -28,15 +28,24 @@ export const fuzzySequentialMatch = (filter: string, ...words: string[]) => { continue; } - // The VS Code implementation of filter treats a score of "0" as just barely a match - // But we will typically use this matcher in a .filter(), which interprets 0 as a failure. - // By shifting all scores up by 1, we allow "0" matches, while retaining score precedence - const score = scores[0] + 1; + // The VS Code implementation of filter returns a: + // - Negative score for a good match that starts in the middle of the string + // - Positive score if the match starts at the beginning of the string + // - 0 if the filter string is just barely a match + // - undefined for no match + // The "0" return is problematic since .filter() will remove that match, even though a 0 == good match. + // So, if we encounter a 0 return, set it to 1 so the match will be included, and still respect ordering. + const score = scores[0] === 0 ? 1 : scores[0]; if (score > topScore) { topScore = score; } } + + if (topScore === Number.NEGATIVE_INFINITY) { + return undefined; + } + return topScore; }; @@ -59,7 +68,7 @@ export const fuzzyFilterSort: FuzzyFilterSort = (filter, items) => { : fuzzySequentialMatch(filter, item.filterText); return item; }) - .filter((item) => item.score !== undefined && item.score > 0) + .filter((item) => item.score !== undefined) .sort(({ score: scoreA = 0 }, { score: scoreB = 0 }) => scoreA > scoreB ? -1 : scoreA < scoreB ? 1 : 0 ); diff --git a/test-mocha/common/string/sequence_matching.test.ts b/test-mocha/common/string/sequence_matching.test.ts index 48400606ea..1b079cb4ba 100644 --- a/test-mocha/common/string/sequence_matching.test.ts +++ b/test-mocha/common/string/sequence_matching.test.ts @@ -20,25 +20,25 @@ describe("fuzzySequentialMatch", () => { }); const shouldMatchEntity = [ - createExpectation("automation.ticker", 138), - createExpectation("automation.ticke", 129), - createExpectation("automation.", 89), - createExpectation("au", 17), - createExpectation("automationticker", 107), - createExpectation("tion.tick", 18), - createExpectation("ticker", 1), - createExpectation("automation.r", 89), - createExpectation("tick", 1), - createExpectation("aumatick", 15), - createExpectation("aion.tck", 14), - createExpectation("ioticker", 19), - createExpectation("atmto.ikr", 1), - createExpectation("uoaintce", 1), - createExpectation("au.tce", 17), - createExpectation("tomaontkr", 9), - createExpectation("s", 7), - createExpectation("stocks", 48), - createExpectation("sks", 7), + createExpectation("automation.ticker", 131), + createExpectation("automation.ticke", 121), + createExpectation("automation.", 82), + createExpectation("au", 10), + createExpectation("automationticker", 85), + createExpectation("tion.tick", 8), + createExpectation("ticker", -4), + createExpectation("automation.r", 73), + createExpectation("tick", -8), + createExpectation("aumatick", 9), + createExpectation("aion.tck", 4), + createExpectation("ioticker", -4), + createExpectation("atmto.ikr", -34), + createExpectation("uoaintce", -39), + createExpectation("au.tce", -3), + createExpectation("tomaontkr", -19), + createExpectation("s", 1), + createExpectation("stocks", 42), + createExpectation("sks", -5), ]; const shouldNotMatchEntity = [ @@ -72,7 +72,7 @@ describe("fuzzySequentialMatch", () => { entity.entity_id, entity.friendly_name ); - assert.equal(res, 0); + assert.equal(res, undefined); }); } }); @@ -80,24 +80,45 @@ describe("fuzzySequentialMatch", () => { describe("fuzzyFilterSort", () => { const filter = "ticker"; - const item1 = { + const automationTicker = { filterText: "automation.ticker", altText: "Stocks", score: 0, }; - const item2 = { filterText: "sensor.ticker", altText: "Stocks up", score: 0 }; - const item3 = { + const ticker = { + filterText: "ticker", + altText: "Just ticker", + score: 0, + }; + const sensorTicker = { + filterText: "sensor.ticker", + altText: "Stocks up", + score: 0, + }; + const timerCheckRouter = { filterText: "automation.check_router", altText: "Timer Check Router", score: 0, }; - const itemsBeforeFilter = [item1, item2, item3]; + const badMatch = { + filterText: "light.chandelier", + altText: "Chandelier", + score: 0, + }; + const itemsBeforeFilter = [ + automationTicker, + sensorTicker, + timerCheckRouter, + ticker, + badMatch, + ]; - it(`sorts correctly`, () => { + it(`filters and sorts correctly`, () => { const expectedItemsAfterFilter = [ - { ...item2, score: 23 }, - { ...item3, score: 12 }, - { ...item1, score: 1 }, + { ...ticker, score: 44 }, + { ...sensorTicker, score: 1 }, + { ...automationTicker, score: -4 }, + { ...timerCheckRouter, score: -8 }, ]; const res = fuzzyFilterSort(filter, itemsBeforeFilter);