mirror of
https://github.com/colanode/colanode.git
synced 2025-12-16 11:47:47 +01:00
Enhance chunking service with comprehensive metadata retrieval and context enrichment
This commit is contained in:
@@ -1,23 +1,99 @@
|
||||
// Updated chunking-service.ts
|
||||
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
||||
import { configuration } from '@/lib/configuration';
|
||||
import { database } from '@/data/database';
|
||||
import { addContextToChunk } from '@/services/llm-service';
|
||||
import {
|
||||
DocumentContent,
|
||||
NodeAttributes,
|
||||
MessageAttributes,
|
||||
ChatAttributes,
|
||||
ChannelAttributes,
|
||||
RecordAttributes,
|
||||
DatabaseAttributes,
|
||||
extractBlockTexts,
|
||||
} from '@colanode/core';
|
||||
|
||||
export type ChunkingMetadata = {
|
||||
nodeId: string;
|
||||
type BaseNodeMetadata = {
|
||||
id: string;
|
||||
type: string;
|
||||
name?: string;
|
||||
parentName?: string;
|
||||
spaceName?: string;
|
||||
attributes: NodeAttributes;
|
||||
createdAt: Date;
|
||||
createdBy: string;
|
||||
author?: { id: string; name: string };
|
||||
};
|
||||
|
||||
type MessageNodeMetadata = BaseNodeMetadata & {
|
||||
nodeType: 'message';
|
||||
parentContext?: {
|
||||
type: 'chat' | 'channel';
|
||||
name?: string;
|
||||
collaborators?: Array<{ id: string; name: string }>;
|
||||
};
|
||||
referencedMessage?: {
|
||||
id: string;
|
||||
content: string;
|
||||
author?: { id: string; name: string };
|
||||
};
|
||||
};
|
||||
|
||||
type ChatNodeMetadata = BaseNodeMetadata & {
|
||||
nodeType: 'chat';
|
||||
collaborators: Array<{ id: string; name: string }>;
|
||||
};
|
||||
|
||||
type ChannelNodeMetadata = BaseNodeMetadata & {
|
||||
nodeType: 'channel';
|
||||
collaborators?: Array<{ id: string; name: string }>;
|
||||
};
|
||||
|
||||
type RecordNodeMetadata = BaseNodeMetadata & {
|
||||
nodeType: 'record';
|
||||
databaseName?: string;
|
||||
};
|
||||
|
||||
type DatabaseNodeMetadata = BaseNodeMetadata & {
|
||||
nodeType: 'database';
|
||||
};
|
||||
|
||||
type PageNodeMetadata = BaseNodeMetadata & {
|
||||
nodeType: 'page';
|
||||
parentName?: string;
|
||||
};
|
||||
|
||||
type FileNodeMetadata = BaseNodeMetadata & {
|
||||
nodeType: 'file';
|
||||
};
|
||||
|
||||
type NodeMetadata = {
|
||||
type: 'node';
|
||||
node:
|
||||
| MessageNodeMetadata
|
||||
| ChatNodeMetadata
|
||||
| ChannelNodeMetadata
|
||||
| RecordNodeMetadata
|
||||
| DatabaseNodeMetadata
|
||||
| PageNodeMetadata
|
||||
| FileNodeMetadata;
|
||||
};
|
||||
|
||||
type DocumentMetadata = {
|
||||
type: 'document';
|
||||
document: {
|
||||
id: string;
|
||||
content: DocumentContent;
|
||||
createdAt: Date;
|
||||
nodeType?: string;
|
||||
nodeName?: string;
|
||||
parentNodeName?: string;
|
||||
};
|
||||
};
|
||||
|
||||
export type ChunkingMetadata = NodeMetadata | DocumentMetadata;
|
||||
|
||||
export class ChunkingService {
|
||||
// Unified chunkText that optionally enriches the chunk with context metadata.
|
||||
public async chunkText(
|
||||
text: string,
|
||||
metadataInfo?: { type: 'node' | 'document'; id: string }
|
||||
metadata?: { type: 'node' | 'document'; id: string }
|
||||
): Promise<string[]> {
|
||||
const chunkSize = configuration.ai.chunking.defaultChunkSize;
|
||||
const chunkOverlap = configuration.ai.chunking.defaultOverlap;
|
||||
@@ -26,73 +102,324 @@ export class ChunkingService {
|
||||
chunkOverlap,
|
||||
});
|
||||
const docs = await splitter.createDocuments([text]);
|
||||
let chunks = docs
|
||||
.map((doc) => doc.pageContent)
|
||||
.filter((c) => c.trim().length > 10);
|
||||
let chunks = docs.map((doc) => doc.pageContent);
|
||||
chunks = chunks.filter((c) => c.trim().length > 10);
|
||||
if (configuration.ai.chunking.enhanceWithContext) {
|
||||
// Fetch unified metadata (using a single query if possible)
|
||||
const metadata = metadataInfo
|
||||
? await this.fetchMetadata(metadataInfo)
|
||||
: undefined;
|
||||
chunks = await Promise.all(
|
||||
chunks.map(async (chunk) => {
|
||||
return addContextToChunk(chunk, text, metadata);
|
||||
})
|
||||
);
|
||||
const enrichedMetadata = await this.fetchMetadata(metadata);
|
||||
const enriched: string[] = [];
|
||||
for (const chunk of chunks) {
|
||||
const c = await addContextToChunk(chunk, text, enrichedMetadata);
|
||||
enriched.push(c);
|
||||
}
|
||||
return enriched;
|
||||
}
|
||||
return chunks;
|
||||
}
|
||||
|
||||
// A unified metadata fetch which uses a join to gather node and parent (space) details.
|
||||
private async fetchMetadata(info: {
|
||||
private async fetchMetadata(metadata?: {
|
||||
type: 'node' | 'document';
|
||||
id: string;
|
||||
}): Promise<ChunkingMetadata | undefined> {
|
||||
if (info.type === 'node') {
|
||||
// Fetch node along with parent (if exists) and the root (assumed to be the space)
|
||||
const result = await database
|
||||
if (!metadata) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
if (metadata.type === 'node') {
|
||||
const node = await database
|
||||
.selectFrom('nodes')
|
||||
.leftJoin('nodes as parent', 'nodes.parent_id', 'parent.id')
|
||||
.leftJoin('nodes as root', 'nodes.root_id', 'root.id')
|
||||
.select([
|
||||
'nodes.id as nodeId',
|
||||
'nodes.type',
|
||||
"nodes.attributes->>'name' as name",
|
||||
"parent.attributes->>'name' as parentName",
|
||||
"root.attributes->>'name' as spaceName",
|
||||
'nodes.created_at as createdAt',
|
||||
])
|
||||
.where('nodes.id', '=', info.id)
|
||||
.selectAll()
|
||||
.where('id', '=', metadata.id)
|
||||
.executeTakeFirst();
|
||||
if (!result) return undefined;
|
||||
return {
|
||||
nodeId: result.nodeId,
|
||||
type: result.type,
|
||||
name: result.name,
|
||||
parentName: result.parentName,
|
||||
spaceName: result.spaceName,
|
||||
createdAt: result.createdAt,
|
||||
if (!node) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
const attributes = node.attributes as NodeAttributes;
|
||||
const author = await database
|
||||
.selectFrom('users')
|
||||
.select(['id', 'name'])
|
||||
.where('id', '=', node.created_by)
|
||||
.executeTakeFirst();
|
||||
|
||||
const baseMetadata: BaseNodeMetadata = {
|
||||
id: node.id,
|
||||
type: attributes.type,
|
||||
attributes,
|
||||
createdAt: node.created_at,
|
||||
createdBy: node.created_by,
|
||||
author: author ?? undefined,
|
||||
};
|
||||
|
||||
switch (attributes.type) {
|
||||
case 'message': {
|
||||
const messageMetadata: MessageNodeMetadata = {
|
||||
...baseMetadata,
|
||||
nodeType: 'message',
|
||||
};
|
||||
|
||||
// If message has a reference, fetch it
|
||||
if (attributes.referenceId) {
|
||||
const referencedNode = await database
|
||||
.selectFrom('nodes')
|
||||
.selectAll()
|
||||
.where('id', '=', attributes.referenceId)
|
||||
.executeTakeFirst();
|
||||
|
||||
if (
|
||||
referencedNode &&
|
||||
referencedNode.attributes.type === 'message'
|
||||
) {
|
||||
const refAttributes =
|
||||
referencedNode.attributes as MessageAttributes;
|
||||
const refAuthor = await database
|
||||
.selectFrom('users')
|
||||
.select(['id', 'name'])
|
||||
.where('id', '=', referencedNode.created_by)
|
||||
.executeTakeFirst();
|
||||
|
||||
messageMetadata.referencedMessage = {
|
||||
id: referencedNode.id,
|
||||
content:
|
||||
extractBlockTexts(referencedNode.id, refAttributes.content) ??
|
||||
'',
|
||||
author: refAuthor ?? undefined,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// Get parent context (chat or channel) if available
|
||||
if (node.parent_id) {
|
||||
const parentNode = await database
|
||||
.selectFrom('nodes')
|
||||
.selectAll()
|
||||
.where('id', '=', node.parent_id)
|
||||
.executeTakeFirst();
|
||||
|
||||
if (parentNode) {
|
||||
switch (parentNode.attributes.type) {
|
||||
case 'chat': {
|
||||
const chatAttributes =
|
||||
parentNode.attributes as ChatAttributes;
|
||||
messageMetadata.parentContext = {
|
||||
type: 'chat',
|
||||
};
|
||||
|
||||
// Fetch chat collaborators
|
||||
if (chatAttributes.collaborators) {
|
||||
const collaborators = await database
|
||||
.selectFrom('users')
|
||||
.select(['id', 'name'])
|
||||
.where(
|
||||
'id',
|
||||
'in',
|
||||
Object.keys(chatAttributes.collaborators)
|
||||
)
|
||||
.execute();
|
||||
messageMetadata.parentContext.collaborators = collaborators;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 'channel': {
|
||||
const channelAttributes =
|
||||
parentNode.attributes as ChannelAttributes;
|
||||
messageMetadata.parentContext = {
|
||||
type: 'channel',
|
||||
name: channelAttributes.name,
|
||||
};
|
||||
|
||||
// Fetch channel collaborators if they exist
|
||||
if (
|
||||
'collaborators' in channelAttributes &&
|
||||
channelAttributes.collaborators
|
||||
) {
|
||||
const collaborators = await database
|
||||
.selectFrom('users')
|
||||
.select(['id', 'name'])
|
||||
.where(
|
||||
'id',
|
||||
'in',
|
||||
Object.keys(channelAttributes.collaborators)
|
||||
)
|
||||
.execute();
|
||||
messageMetadata.parentContext.collaborators = collaborators;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return { type: 'node', node: messageMetadata };
|
||||
}
|
||||
|
||||
case 'chat': {
|
||||
const chatAttributes = attributes as ChatAttributes;
|
||||
let collaborators: Array<{ id: string; name: string }> = [];
|
||||
if (chatAttributes.collaborators) {
|
||||
collaborators = await database
|
||||
.selectFrom('users')
|
||||
.select(['id', 'name'])
|
||||
.where('id', 'in', Object.keys(chatAttributes.collaborators))
|
||||
.execute();
|
||||
}
|
||||
|
||||
return {
|
||||
type: 'node',
|
||||
node: {
|
||||
...baseMetadata,
|
||||
nodeType: 'chat',
|
||||
collaborators,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
case 'channel': {
|
||||
const channelAttributes = attributes as ChannelAttributes;
|
||||
let collaborators: Array<{ id: string; name: string }> | undefined;
|
||||
|
||||
// Only fetch collaborators if the channel has them
|
||||
if (
|
||||
'collaborators' in channelAttributes &&
|
||||
channelAttributes.collaborators
|
||||
) {
|
||||
collaborators = await database
|
||||
.selectFrom('users')
|
||||
.select(['id', 'name'])
|
||||
.where('id', 'in', Object.keys(channelAttributes.collaborators))
|
||||
.execute();
|
||||
}
|
||||
|
||||
return {
|
||||
type: 'node',
|
||||
node: {
|
||||
...baseMetadata,
|
||||
nodeType: 'channel',
|
||||
collaborators,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
case 'record': {
|
||||
const recordAttributes = attributes as RecordAttributes;
|
||||
const recordMetadata: RecordNodeMetadata = {
|
||||
...baseMetadata,
|
||||
nodeType: 'record',
|
||||
};
|
||||
|
||||
// Fetch database name
|
||||
const databaseNode = await database
|
||||
.selectFrom('nodes')
|
||||
.selectAll()
|
||||
.where('id', '=', recordAttributes.databaseId)
|
||||
.executeTakeFirst();
|
||||
|
||||
if (databaseNode?.attributes.type === 'database') {
|
||||
const dbAttributes = databaseNode.attributes as DatabaseAttributes;
|
||||
recordMetadata.databaseName = dbAttributes.name;
|
||||
}
|
||||
|
||||
return { type: 'node', node: recordMetadata };
|
||||
}
|
||||
|
||||
case 'database': {
|
||||
return {
|
||||
type: 'node',
|
||||
node: {
|
||||
...baseMetadata,
|
||||
nodeType: 'database',
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
case 'page': {
|
||||
const pageMetadata: PageNodeMetadata = {
|
||||
...baseMetadata,
|
||||
nodeType: 'page',
|
||||
};
|
||||
|
||||
// Get parent folder/space name for context
|
||||
if (node.parent_id) {
|
||||
const parentNode = await database
|
||||
.selectFrom('nodes')
|
||||
.selectAll()
|
||||
.where('id', '=', node.parent_id)
|
||||
.executeTakeFirst();
|
||||
|
||||
if (parentNode && 'name' in parentNode.attributes) {
|
||||
pageMetadata.parentName = parentNode.attributes.name;
|
||||
}
|
||||
}
|
||||
|
||||
return { type: 'node', node: pageMetadata };
|
||||
}
|
||||
|
||||
case 'file': {
|
||||
return {
|
||||
type: 'node',
|
||||
node: {
|
||||
...baseMetadata,
|
||||
nodeType: 'file',
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
default:
|
||||
return {
|
||||
type: 'node',
|
||||
node: {
|
||||
...baseMetadata,
|
||||
nodeType: attributes.type as any,
|
||||
},
|
||||
};
|
||||
}
|
||||
} else {
|
||||
// For documents, assume similar metadata based on associated node.
|
||||
const result = await database
|
||||
// For documents, fetch both document and its associated node
|
||||
const document = await database
|
||||
.selectFrom('documents')
|
||||
.innerJoin('nodes', 'documents.id', 'nodes.id')
|
||||
.select([
|
||||
'nodes.id as nodeId',
|
||||
'nodes.type',
|
||||
"nodes.attributes->>'name' as name",
|
||||
'nodes.created_at as createdAt',
|
||||
])
|
||||
.where('documents.id', '=', info.id)
|
||||
.selectAll()
|
||||
.where('id', '=', metadata.id)
|
||||
.executeTakeFirst();
|
||||
if (!result) return undefined;
|
||||
return {
|
||||
nodeId: result.nodeId,
|
||||
type: result.type,
|
||||
name: result.name,
|
||||
createdAt: result.createdAt,
|
||||
if (!document) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
const documentMetadata: DocumentMetadata = {
|
||||
type: 'document',
|
||||
document: {
|
||||
id: document.id,
|
||||
content: document.content,
|
||||
createdAt: document.created_at,
|
||||
},
|
||||
};
|
||||
|
||||
// Try to fetch associated node for additional context
|
||||
const node = await database
|
||||
.selectFrom('nodes')
|
||||
.selectAll()
|
||||
.where('id', '=', document.id)
|
||||
.executeTakeFirst();
|
||||
|
||||
if (node) {
|
||||
documentMetadata.document.nodeType = node.attributes.type;
|
||||
if ('name' in node.attributes) {
|
||||
documentMetadata.document.nodeName = node.attributes.name;
|
||||
}
|
||||
|
||||
if (node.parent_id) {
|
||||
const parentNode = await database
|
||||
.selectFrom('nodes')
|
||||
.selectAll()
|
||||
.where('id', '=', node.parent_id)
|
||||
.executeTakeFirst();
|
||||
|
||||
if (parentNode && 'name' in parentNode.attributes) {
|
||||
documentMetadata.document.parentNodeName =
|
||||
parentNode.attributes.name;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return documentMetadata;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user