From 7b289f8e583d3bb6027ed2c9ada5a153d67579dd Mon Sep 17 00:00:00 2001
From: Abdullah Atta
Date: Mon, 2 Jun 2025 15:32:47 +0500
Subject: [PATCH] core: sort content matches

---
 packages/core/src/api/lookup.ts | 110 +++++++++++++++++++++++++++++---
 1 file changed, 102 insertions(+), 8 deletions(-)

diff --git a/packages/core/src/api/lookup.ts b/packages/core/src/api/lookup.ts
index 20d0c68c0..0765da3bb 100644
--- a/packages/core/src/api/lookup.ts
+++ b/packages/core/src/api/lookup.ts
@@ -276,6 +276,14 @@ export default class Lookup {
       }
     }
 
+    for (const result of results) {
+      result.content.sort(
+        (a, b) =>
+          getMatchScore(b, highlightTokens) -
+          getMatchScore(a, highlightTokens)
+      );
+    }
+
     return {
       ids: results.map((c) => c.id),
       items: results
@@ -885,13 +893,99 @@ function highlightHtmlContent(html: string, queries: string[]): string {
   parser.write(html);
   parser.end();
 
-  // Process any remaining text
-  if (textBuffer) {
-    result += textBuffer.replace(
-      searchRegex,
-      "$1"
-    );
-  }
-
   return result;
 }
+
+interface MatchScoreOptions {
+  lengthMultiplier: number; // Weight for match length
+  positionPenalty: number; // Penalty for matches further down
+  consecutiveBonus: number; // Bonus for different consecutive token matches
+  repetitionPenalty: number; // Penalty for same token repeated consecutively
+  uniqueTokenBonus: number; // Large bonus for each unique token matched
+  completeWordBonus: number; // Bonus for complete word matches
+}
+
+const DEFAULT_SCORE_OPTIONS: MatchScoreOptions = {
+  lengthMultiplier: 1.5, // Favor longer matches
+  positionPenalty: 0.05, // Small penalty for each position down
+  consecutiveBonus: 2.0, // Bonus for consecutive different tokens
+  repetitionPenalty: 0.5, // Significant penalty for repetition
+  uniqueTokenBonus: 10.0, // Large bonus for each unique token
+  completeWordBonus: 5.0 // Significant bonus for complete word matches
+};
+
+/**
+ * Reports whether a match is delimited by whitespace (or by the start/end of
+ * its surrounding text) on both sides.
+ */
+function isCompleteWord(match: Match): boolean {
+  const prefixEndsWithSpace = /\s$/.test(match.prefix) || match.prefix === "";
+  const suffixStartsWithSpace = /^\s/.test(match.suffix) || match.suffix === "";
+  return prefixEndsWithSpace && suffixStartsWithSpace;
+}
+
+/**
+ * Scores a group of matches so that groups can be ranked by relevance.
+ *
+ * Each match contributes its length times `lengthMultiplier`, plus
+ * `completeWordBonus` when it is a complete word. That contribution is scaled
+ * by a position factor, then multiplied by `consecutiveBonus` when its text
+ * differs from the previous match, or damped by `repetitionPenalty` when it
+ * repeats it. Every distinct token found in the group adds `uniqueTokenBonus`.
+ *
+ * @param matches Matches of one group; the array index is the position.
+ * @param tokens Search tokens, compared case-insensitively.
+ * @param options Scoring weights.
+ * @returns The group's score; higher ranks first.
+ */
+function getMatchScore(
+  matches: Match[],
+  tokens: string[],
+  options: MatchScoreOptions = DEFAULT_SCORE_OPTIONS
+): number {
+  // Lowercase the tokens once instead of once per match.
+  const lowerTokens = tokens.map((token) => token.toLowerCase());
+  const uniqueTokens = new Set<string>();
+  let score = 0;
+  let lastMatchText = "";
+  let repetitionCount = 0;
+
+  matches.forEach((match, index) => {
+    const matchText = match.match.toLowerCase();
+
+    // Remember which tokens this match covers.
+    for (const token of lowerTokens) {
+      if (matchText.includes(token)) uniqueTokens.add(token);
+    }
+
+    // Base score from match length, plus a bonus for complete words.
+    let matchScore = match.match.length * options.lengthMultiplier;
+    if (isCompleteWord(match)) {
+      matchScore += options.completeWordBonus;
+    }
+
+    // Position penalty. Clamp at zero: otherwise the factor turns negative
+    // once index * positionPenalty exceeds 1 and late matches would subtract
+    // from the score.
+    matchScore *= Math.max(0, 1 - index * options.positionPenalty);
+
+    // Handle consecutive matches and repetition
+    if (index > 0) {
+      if (matchText === lastMatchText) {
+        repetitionCount++;
+        matchScore *= Math.pow(options.repetitionPenalty, repetitionCount);
+      } else {
+        matchScore *= options.consecutiveBonus;
+        repetitionCount = 0;
+      }
+    }
+
+    lastMatchText = matchText;
+    score += matchScore;
+  });
+
+  // Add unique token bonus once at the end
+  score += uniqueTokens.size * options.uniqueTokenBonus;
+
+  return score;
+}