diff --git a/packages/core/src/api/lookup.ts b/packages/core/src/api/lookup.ts index 3cb71940b..161950268 100644 --- a/packages/core/src/api/lookup.ts +++ b/packages/core/src/api/lookup.ts @@ -38,8 +38,9 @@ import { rebuildSearchIndex } from "../database/fts.js"; import { transformQuery } from "../utils/query-transformer.js"; import { getSortSelectors, groupArray } from "../utils/grouping.js"; import { fuzzy } from "../utils/fuzzy.js"; -import { extractMatchingBlocks, extractText } from "../utils/html-parser.js"; +import { extractMatchingBlocks } from "../utils/html-parser.js"; import { findOrAdd } from "../utils/array.js"; +import { Parser } from "htmlparser2"; type SearchResults = { sorted: (sortOptions?: SortOptions) => Promise>; @@ -66,7 +67,6 @@ export default class Lookup { notes(query: string, notes?: FilteredSelector): SearchResults { return this.toSearchResults(async (limit, sortOptions) => { - const db = this.db.sql() as unknown as Kysely; const excludedIds = this.db.trash.cache.notes; const { query: transformedQuery, tokens } = transformQuery(query); @@ -74,36 +74,7 @@ export default class Lookup { const resultsA: string[] = transformedQuery.length === 0 ? [] - : await db - .selectFrom((eb) => - eb - .selectFrom("notes_fts") - .$if(!!notes, (eb) => - eb.where("id", "in", notes!.filter.select("id")) - ) - .$if(excludedIds.length > 0, (eb) => - eb.where("id", "not in", excludedIds) - ) - .where("title", "match", transformedQuery) - .select(["id", sql`rank * 10`.as("rank")]) - .unionAll((eb) => - eb - .selectFrom("content_fts") - .$if(!!notes, (eb) => - eb.where("noteId", "in", notes!.filter.select("id")) - ) - .$if(excludedIds.length > 0, (eb) => - eb.where("noteId", "not in", excludedIds) - ) - .where("data", "match", transformedQuery) - .select(["noteId as id", "rank"]) - .$castTo<{ - id: string; - rank: number; - }>() - ) - .as("results") - ) + : await this.ftsQueryBuilder(transformedQuery, excludedIds, notes) .select(["results.id"]) .groupBy("results.id") .orderBy( @@ -124,39 +95,14 @@ export default class Lookup { ); if (smallTokens.length === 0) return resultsA; - const results = []; - - const titles = await db - .selectFrom("notes") - .$if(!!transformedQuery && resultsA.length > 0, (eb) => - eb.where("id", "in", resultsA) - ) - .select(["id", "title"]) + const results = await this.regexQueryBuilder( + smallTokens, + !!transformedQuery && resultsA.length > 0 ? resultsA : notes + ) + .select("results.id") .execute(); - const htmls = await db - .selectFrom("content") - .$if(!!transformedQuery && resultsA.length > 0, (eb) => - eb.where("noteId", "in", resultsA) - ) - .select(["data", "noteId as id"]) - .$castTo<{ data: string; id: string }>() - .execute(); - - for (let i = 0; i < titles.length; i++) { - const title = titles[i]; - const html = htmls.find((h) => h.id === title.id); - const text = html ? extractText(html.data) : ""; - - if ( - smallTokens.every((token) => !!title.title?.includes(token)) || - smallTokens.every((token) => !!text?.includes(token)) - ) { - results.push(title.id); - } - } - - return results; + return results.map((r) => r.id); }, notes || this.db.notes.all); } @@ -170,91 +116,17 @@ export default class Lookup { const { query: transformedQuery, tokens } = transformQuery(query); - let mergedResults: HighlightedResult[] = []; - if (transformedQuery.length > 0) { - console.time("sql lookup"); - const results = await db - .selectFrom((eb) => - eb - .selectFrom("notes_fts") - .$if(!!notes, (eb) => - eb.where("id", "in", notes!.filter.select("id")) - ) - .$if(excludedIds.length > 0, (eb) => - eb.where("id", "not in", excludedIds) - ) - .where("title", "match", transformedQuery) - .select([ - "id", - sql`'title'`.as("type"), - sql`highlight(notes_fts, 1, '', '')`.as( - "match" - ), - sql`rank * 10`.as("rank") - ]) - .unionAll((eb) => - eb - .selectFrom("content_fts") - .$if(!!notes, (eb) => - eb.where("noteId", "in", notes!.filter.select("id")) - ) - .$if(excludedIds.length > 0, (eb) => - eb.where("noteId", "not in", excludedIds) - ) - .where("data", "match", transformedQuery) - .select([ - "noteId as id", - sql`'content'`.as("type"), - sql`highlight(content_fts, 2, '', '')`.as( - "match" - ), - "rank" - ]) - .$castTo<{ - id: string; - type: string; - rank: number; - match: string; - }>() - ) - .as("results") - ) - .select(["results.id", "results.match", "results.type", "results.rank"]) - .execute() - .catch((e) => { - logger.error(e, `Error while searching`, { query }); - return []; - }); - console.timeEnd("sql lookup"); - - console.time("merge results"); - for (const result of results) { - const old = findOrAdd(mergedResults, (r) => r.id === result.id, { - type: "searchResult", - id: result.id, - content: [], - title: [], - rank: 0, - rawContent: "", - dateCreated: 0, - dateModified: 0 - }); - - if (result.type === "content") { - old.content = extractMatchingBlocks( - result.match, - MATCH_TAG_NAME - ).flatMap((block) => { - return splitHighlightedMatch(block); - }); - old.rawContent = result.match; - } - if (result.type === "title") - old.title = splitHighlightedMatch(result.match).flatMap((m) => m); - old.rank += result.rank; - } - console.timeEnd("merge results"); - } + console.time("gather matches"); + const ftsResults = + transformedQuery.length <= 0 + ? [] + : await this.ftsQueryBuilder(transformedQuery, excludedIds, notes) + .select(["id", "type", "rank"]) + .execute() + .catch((e) => { + logger.error(e, `Error while searching`, { query }); + return []; + }); const smallTokens = Array.from( new Set( @@ -262,108 +134,233 @@ export default class Lookup { ).values() ); - if (smallTokens.length > 0) { - const ids = mergedResults.map((r) => r.id); - console.time("fetch titles"); - const titles = await db - .selectFrom("notes") - .$if(!!transformedQuery && ids.length > 0, (eb) => - eb.where("id", "in", ids) - ) - .select(["id", "title"]) - .execute(); - console.timeEnd("fetch titles"); + const ftsIds = ftsResults.map((r) => r.id); + const regexMatches = + smallTokens.length > 0 + ? await this.regexQueryBuilder( + smallTokens, + !!transformedQuery && ftsIds.length > 0 ? ftsIds : notes + ) + .select(["results.id", "results.type", sql`1`.as("rank")]) + .execute() + : []; + console.timeEnd("gather matches"); - console.time("fetch htmls"); - const htmls = await db - .selectFrom("content") - .where("content.locked", "!=", true) - .$if(!!transformedQuery && ids.length > 0, (eb) => - eb.where("noteId", "in", ids) - ) - .select(["data", "noteId as id"]) - .$castTo<{ data: string; id: string }>() - .execute(); - console.timeEnd("fetch htmls"); + console.time("sorting matches"); + type Matches = { + ids: string[]; + values: { id: string; types: string[]; rank: number }[]; + }; + let matches: Matches = { ids: [], values: [] }; + for (const array of [ftsResults, regexMatches]) + for (const { id, type, rank } of array) { + const index = matches.ids.indexOf(id); + const match = + index === -1 + ? { + id, + types: [], + rank: 0 + } + : matches.values[index]; + match.types.push(type); + match.rank += rank || 0; - console.time("small token lookup"); - for (let i = 0; i < titles.length; i++) { - const title = titles[i]; - const html = htmls.find((h) => h.id === title.id); - const text = html ? extractText(html.data) : ""; + if (index === -1) { + matches.ids.push(id); + matches.values.push(match); + } + } - if ( - (title.title && - smallTokens.every((token) => !!title.title?.includes(token))) || - (text && smallTokens.every((token) => !!text?.includes(token))) - ) { - const result = findOrAdd(mergedResults, (r) => r.id === title.id, { + if (!sortOptions || sortOptions.sortBy === "relevance") { + matches.values.sort( + sortOptions?.sortDirection === "desc" + ? (a, b) => a.rank - b.rank + : (a, b) => b.rank - a.rank + ); + matches.ids = matches.values.map((c) => c.id); + } else { + const sortedNoteIds = await this.db.notes.all + .fields(["notes.id"]) + .items(matches.ids, sortOptions); + const sorted: Matches = { ids: [], values: [] }; + for (const { id } of sortedNoteIds) { + const index = matches.ids.indexOf(id); + if (index === -1) continue; + sorted.values.push(matches.values[index]); + sorted.ids.push(id); + } + matches = sorted; + } + console.timeEnd("sorting matches"); + + return new VirtualizedGrouping( + matches.ids.length, + 20, + async () => matches.ids, + async (start, end) => { + const chunk = matches.values.slice(start, end); + const titleMatches = chunk + .filter((c) => c.types.includes("title")) + .map((c) => c.id); + const contentMatches = chunk + .filter((c) => c.types.includes("content")) + .map((c) => c.id); + const results: HighlightedResult[] = []; + + const titles = await db + .selectFrom("notes") + .where("id", "in", titleMatches) + .select(["id", "title"]) + .execute(); + + for (const title of titles) { + results.push({ id: title.id, - title: stringToMatch(title.title || ""), + title: splitHighlightedMatch( + highlightQueries(title.title || "", tokens).text + ).flatMap((m) => m), type: "searchResult", content: [], rank: 0, dateCreated: 0, dateModified: 0 }); - - const merged = mergeMatches( - result.title, - splitHighlightedMatch( - highlightQueries(title.title || "", smallTokens) - ).flatMap((m) => m) - ); - if (merged) result.title = merged; - - result.content.push( - ...splitHighlightedMatch(highlightQueries(text, smallTokens)) - ); } + + const htmls = await db + .selectFrom("content") + .where("noteId", "in", contentMatches) + .select(["data", "noteId as id"]) + .$castTo<{ data: string; id: string }>() + .execute(); + + for (const html of htmls) { + const result = findOrAdd(results, (r) => r.id === html.id, { + id: html.id, + title: [], + type: "searchResult", + content: [], + rank: 0, + dateCreated: 0, + dateModified: 0 + }); + const highlighted = highlightHtmlContent(html.data, tokens); + result.content = extractMatchingBlocks( + highlighted, + MATCH_TAG_NAME + ).flatMap((block) => { + return splitHighlightedMatch(block); + }); + if (result.content.length === 0) continue; + result.rawContent = highlighted; + } + + const resultsWithMissingTitle = results + .filter((r) => !r.title.length) + .map((r) => r.id); + + if (resultsWithMissingTitle.length > 0) { + const titles = await db + .selectFrom("notes") + .where("id", "in", resultsWithMissingTitle) + .select(["id", "title"]) + .execute(); + for (const title of titles) { + const result = results.find((r) => r.id === title.id); + if (!result || !title.title) continue; + result.title = stringToMatch(title.title); + } + } + + return { + ids: results.map((c) => c.id), + items: results + }; } - console.timeEnd("small token lookup"); - } + ); + } - const resultsWithMissingTitle = mergedResults - .filter((r) => !r.title.length) - .map((r) => r.id); + private ftsQueryBuilder( + query: string, + excludedIds: string[] = [], + filter?: FilteredSelector + ) { + const db = this.db.sql() as unknown as Kysely; - if (resultsWithMissingTitle.length > 0) { - console.time("missing title"); - const titles = await db + return db.selectFrom((eb) => + eb + .selectFrom("notes_fts") + .$if(!!filter, (eb) => + eb.where("id", "in", filter!.filter.select("id")) + ) + .$if(excludedIds.length > 0, (eb) => + eb.where("id", "not in", excludedIds) + ) + .where("title", "match", query) + .where("rank", "=", sql`'bm25(1.0, 10.0)'`) + .select(["id", "rank", sql`'title'`.as("type")]) + .unionAll((eb) => + eb + .selectFrom("content_fts") + .$if(!!filter, (eb) => + eb.where("noteId", "in", filter!.filter.select("id")) + ) + .$if(excludedIds.length > 0, (eb) => + eb.where("noteId", "not in", excludedIds) + ) + .where("data", "match", query) + .where("rank", "=", sql`'bm25(1.0, 1.0, 10.0)'`) + .select(["noteId as id", "rank", sql`'content'`.as("type")]) + .$castTo<{ + id: string; + rank: number; + type: "content" | "title"; + }>() + ) + .as("results") + ); + } + private regexQueryBuilder( + queries: string[], + ids?: string[] | FilteredSelector + ) { + const regex = queries + .filter((q) => q && q.length > 0) + .map((q) => q.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")) + .join("|"); + return this.db.sql().selectFrom((eb) => + eb .selectFrom("notes") - .where("id", "in", resultsWithMissingTitle) - .select(["id", "title"]) - .execute(); - for (const title of titles) { - const result = mergedResults.find((r) => r.id === title.id); - if (!result || !title.title) continue; - result.title = stringToMatch(title.title); - } - console.timeEnd("missing title"); - } - - mergedResults = mergedResults.filter((r) => !!r.title.length); - - if (!sortOptions || sortOptions.sortBy === "relevance") - mergedResults.sort( - sortOptions?.sortDirection === "desc" - ? (a, b) => a.rank - b.rank - : (a, b) => b.rank - a.rank - ); - else { - const sortedNoteIds = await this.db.notes.all.fields(["notes.id"]).items( - mergedResults.map((r) => r.id), - sortOptions - ); - const sorted: HighlightedResult[] = []; - for (const { id } of sortedNoteIds) { - const resultForId = mergedResults.find((r) => r.id === id); - if (!resultForId) continue; - sorted.push(resultForId); - } - mergedResults = sorted; - } - return arrayToVirtualizedGrouping(mergedResults, this.db.options.batchSize); + .$if(!!ids, (eb) => + eb.where( + "id", + "in", + Array.isArray(ids) ? ids! : ids!.filter.select("id") + ) + ) + .where("title", "regexp", sql`${regex}`) + .select(["id", sql`'title'`.as("type")]) + .unionAll((eb) => + eb + .selectFrom("content") + .where("content.locked", "!=", true) + .$if(!!ids, (eb) => + eb.where( + "noteId", + "in", + Array.isArray(ids) ? ids! : ids!.filter.select("id") + ) + ) + .where("data", "regexp", sql`${regex}`) + .select(["noteId as id", sql`'content'`.as("type")]) + .$castTo<{ + id: string; + type: "content" | "title"; + }>() + ) + .as("results") + ); } notebooks(query: string) { @@ -561,87 +558,39 @@ export default class Lookup { } } -function highlightQueries(text: string, queries: string[]): string { - if (!text || !queries.length) return text; +function highlightQueries( + text: string, + queries: string[] +): { text: string; hasMatches: boolean } { + if (!text || !queries.length) return { text, hasMatches: false }; - // Collect all ranges - const ranges = []; - const lowerText = text.toLowerCase(); + // Filter out empty queries and escape regex special characters + const patterns = queries + .filter((q) => q.length > 0) + .map((q) => q.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")); - for (let i = 0; i < queries.length; i++) { - const query = queries[i]; - const lowerQuery = query.toLowerCase(); - const queryLen = query.length; - let pos = 0; + if (patterns.length === 0) return { text, hasMatches: false }; - while ((pos = lowerText.indexOf(lowerQuery, pos)) !== -1) { - ranges.push({ - start: pos, - end: pos + queryLen, - len: queryLen - }); - pos += 1; - } + try { + // Create a single regex for all patterns + // Use word boundaries to avoid partial word matches if needed + // (?:) creates a non-capturing group + const regex = new RegExp(patterns.join("|"), "gi"); + + // Track if we found any matches + let hasMatches = false; + + // Replace all matches at once using a string builder approach + const result = text.replace(regex, (match) => { + hasMatches = true; + return `${MATCH_TAG_OPEN}${match}${MATCH_TAG_CLOSE}`; + }); + + return { text: result, hasMatches }; + } catch (error) { + // Fallback to original approach if regex fails (e.g., too large pattern) + return { text, hasMatches: false }; } - - if (!ranges.length) return text; - - // Sort by start position, then by length (longer first) - ranges.sort((a, b) => a.start - b.start || b.len - a.len); - - // Filter overlaps and merge adjacent ranges - const merged = [ranges[0]]; - for (let i = 1; i < ranges.length; i++) { - const current = ranges[i]; - const previous = merged[merged.length - 1]; - - if (current.start > previous.end) { - // No overlap or adjacency - add as new range - merged.push(current); - } else if (current.start === previous.end) { - // Adjacent ranges - merge them - previous.end = current.end; - previous.len = previous.end - previous.start; - } - // Overlapping ranges are skipped - } - - // Build result using array of parts - const parts = []; - let lastEnd = 0; - - for (const { start, end } of merged) { - if (start > lastEnd) { - parts.push(text.slice(lastEnd, start)); - } - parts.push(MATCH_TAG_OPEN, text.slice(start, end), MATCH_TAG_CLOSE); - lastEnd = end; - } - - if (lastEnd < text.length) { - parts.push(text.slice(lastEnd)); - } - - return parts.join(""); -} - -function arrayToVirtualizedGrouping( - array: T[], - batchSize: number -): VirtualizedGrouping { - return new VirtualizedGrouping( - array.length, - batchSize, - () => Promise.resolve(array.map((c) => c.id)), - async (start, end) => { - const items = array.slice(start, end); - return { - ids: items.map((i) => i.id), - items - }; - }, - (items) => groupArray(items, () => `${items.length} results`) - ); } export function splitHighlightedMatch(text: string): Match[][] { @@ -803,86 +752,78 @@ function stringToMatch(str: string): Match[] { ]; } -function mergeMatches(matches1: Match[], matches2: Match[]): Match[] | null { - if (!matches1.length) return matches2; - if (!matches2.length) return matches1; +function highlightHtmlContent(html: string, queries: string[]): string { + if (!html || !queries.length) return html; - // Helper to get full text from matches array - function getFullText(matches: Match[]): string { - if (!matches.length) return ""; - return matches.reduce( - (text, curr) => text + curr.prefix + curr.match + curr.suffix, - "" - ); - } + // Filter and escape regex special chars + const patterns = queries + .filter((q) => q && q.length > 0) + .map((q) => q.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")); - // Get the full original text - const text = getFullText(matches1); - if (getFullText(matches2) !== text) return null; + if (!patterns.length) return html; - // Create array of all match positions - type Position = { - start: number; - end: number; - match: string; - }; + // Create single regex for all patterns + const searchRegex = new RegExp(`(${patterns.join("|")})`, "gi"); - function getPositions(matches: Match[]) { - const positions: Position[] = []; - let pos = 0; - for (let i = 0; i < matches.length; i++) { - const m = matches[i]; - pos += m.prefix.length; - positions.push({ - start: pos, - end: pos + m.match.length, - match: m.match - }); - pos += m.match.length + m.suffix.length; + let result = ""; + let textBuffer = ""; + + // Create parser instance + const parser = new Parser( + { + ontext(text) { + // Process any accumulated text with search regex + textBuffer += text; + }, + onopentag(name, attributes) { + // When we hit a tag, process any accumulated text first + if (textBuffer) { + result += textBuffer.replace( + searchRegex, + "$1" + ); + textBuffer = ""; + } + // Add the tag with its attributes + result += `<${name}`; + for (const [key, value] of Object.entries(attributes)) { + result += ` ${key}="${value}"`; + } + result += ">"; + }, + onclosetag(name) { + // Process any text before closing tag + if (textBuffer) { + result += textBuffer.replace( + searchRegex, + "$1" + ); + textBuffer = ""; + } + result += ``; + }, + + onprocessinginstruction(name, data) { + // Preserve processing instructions (like ) + result += `<${data}>`; + } + }, + { + decodeEntities: false, // Preserve HTML entities + xmlMode: false // Handle HTML specifically } - return positions; - } - - const positions = [...getPositions(matches1), ...getPositions(matches2)].sort( - (a, b) => a.start - b.start || b.end - a.end ); - // Merge overlapping or adjacent positions - const merged: Position[] = []; - let current = positions[0]; + // Parse the HTML + parser.write(html); + parser.end(); - for (let i = 1; i < positions.length; i++) { - const next = positions[i]; - if (next.start <= current.end) { - // Overlapping or adjacent matches - if (next.end > current.end) { - // Extend current match if next one is longer - current = { - start: current.start, - end: next.end, - match: text.slice(current.start, next.end) - }; - } - } else { - merged.push(current); - current = next; - } - } - merged.push(current); - - // Create final matches array - const result: Match[] = []; - for (let i = 0; i < merged.length; i++) { - const pos = merged[i]; - const nextPos = merged[i + 1]; - - const prefix = i === 0 ? text.slice(0, pos.start) : ""; - const match = pos.match; - const suffix = nextPos - ? text.slice(pos.end, nextPos.start) - : text.slice(pos.end); - - result.push({ prefix, match, suffix }); + // Process any remaining text + if (textBuffer) { + result += textBuffer.replace( + searchRegex, + "$1" + ); } return result; diff --git a/packages/core/src/utils/html-parser.ts b/packages/core/src/utils/html-parser.ts index 6ef352fae..d14f229ba 100644 --- a/packages/core/src/utils/html-parser.ts +++ b/packages/core/src/utils/html-parser.ts @@ -131,26 +131,6 @@ export class HTMLParser { } } -export function extractText(html: string, retainTags?: string[]) { - let text = ""; - const parser = new Parser( - { - ontext: (data) => (text += data), - onopentag(name) { - if (retainTags?.includes(name)) text += `<${name}>`; - }, - onclosetag(name) { - if (retainTags?.includes(name)) text += ``; - } - }, - { - lowerCaseTags: false - } - ); - parser.end(html); - return text; -} - const INLINE_TAGS = [ "a", "abbr", diff --git a/packages/core/src/utils/query-transformer.ts b/packages/core/src/utils/query-transformer.ts index 723d69726..81fb4c34c 100644 --- a/packages/core/src/utils/query-transformer.ts +++ b/packages/core/src/utils/query-transformer.ts @@ -17,8 +17,6 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . */ -import { EN_STOPWORDS } from "./stop-words.js"; - type ASTNode = QueryNode | PhraseNode | OperatorNode; type QueryNode = { @@ -163,9 +161,7 @@ function generateSQL(ast: QueryNode): string { } export function transformQuery(query: string) { - const tokens = tokenize(query).filter( - (token) => !EN_STOPWORDS.includes(token) - ); + const tokens = tokenize(query); const largeTokens = tokens.filter( (token) => token.length >= 3 || token === "OR" ); diff --git a/packages/core/src/utils/stop-words.ts b/packages/core/src/utils/stop-words.ts deleted file mode 100644 index feff544a7..000000000 --- a/packages/core/src/utils/stop-words.ts +++ /dev/null @@ -1,321 +0,0 @@ -/* -This file is part of the Notesnook project (https://notesnook.com/) - -Copyright (C) 2023 Streetwriters (Private) Limited - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . -*/ -export const EN_STOPWORDS = [ - "a", - "about", - "above", - "across", - "actually", - "after", - "again", - "against", - "all", - "almost", - "alone", - "along", - "already", - "also", - "although", - "always", - "among", - "an", - "and", - "another", - "any", - "anybody", - "anyone", - "anything", - "anywhere", - "are", - "around", - "as", - "ask", - "asked", - "asking", - "asks", - "at", - "away", - "b", - "be", - "became", - "because", - "become", - "becomes", - "becoming", - "been", - "before", - "behind", - "being", - "best", - "better", - "between", - "both", - "but", - "by", - "c", - "came", - "can", - "certain", - "certainly", - "clearly", - "come", - "consider", - "considering", - "could", - "d", - "did", - "different", - "do", - "does", - "doing", - "done", - "down", - "downwards", - "during", - "e", - "each", - "eg", - "eight", - "either", - "enough", - "even", - "ever", - "every", - "everybody", - "everyone", - "everything", - "everywhere", - "ex", - "exactly", - "example", - "except", - "f", - "fact", - "facts", - "far", - "few", - "first", - "five", - "for", - "four", - "from", - "further", - "g", - "get", - "gets", - "given", - "gives", - "go", - "going", - "got", - "h", - "had", - "has", - "have", - "having", - "he", - "her", - "here", - "herself", - "him", - "himself", - "his", - "how", - "however", - "i", - "ie", - "if", - "in", - "into", - "is", - "it", - "its", - "itself", - "j", - "just", - "k", - "keep", - "keeps", - "knew", - "know", - "known", - "knows", - "l", - "last", - "later", - "least", - "less", - "let", - "like", - "likely", - "m", - "many", - "may", - "me", - "might", - "more", - "most", - "mostly", - "much", - "must", - "my", - "myself", - "n", - "necessary", - "need", - "needs", - "never", - "new", - "next", - "nine", - "no", - "nobody", - "non", - "not", - "nothing", - "now", - "nowhere", - "o", - "of", - "off", - "often", - "old", - "on", - "once", - "one", - "only", - "or", - "other", - "others", - "our", - "out", - "over", - "p", - "per", - "perhaps", - "please", - "possible", - "put", - "q", - "quite", - "r", - "rather", - "really", - "right", - "s", - "said", - "same", - "saw", - "say", - "says", - "second", - "see", - "seem", - "seemed", - "seems", - "seven", - "several", - "shall", - "she", - "should", - "since", - "six", - "so", - "some", - "somebody", - "someone", - "something", - "somewhere", - "still", - "such", - "sure", - "t", - "take", - "taken", - "ten", - "than", - "that", - "the", - "their", - "them", - "then", - "there", - "therefore", - "therein", - "thereupon", - "these", - "they", - "think", - "third", - "this", - "those", - "though", - "three", - "through", - "thus", - "to", - "together", - "too", - "took", - "toward", - "two", - "u", - "under", - "until", - "up", - "upon", - "us", - "use", - "used", - "uses", - "v", - "very", - "w", - "want", - "wanted", - "wants", - "was", - "way", - "we", - "well", - "went", - "were", - "what", - "when", - "where", - "whether", - "which", - "while", - "who", - "whole", - "whose", - "why", - "will", - "with", - "within", - "without", - "would", - "x", - "y", - "yet", - "you", - "your", - "yours", - "z" -];