diff --git a/backend/open_webui/utils/middleware.py b/backend/open_webui/utils/middleware.py index 361f2158cd..f2d472631a 100644 --- a/backend/open_webui/utils/middleware.py +++ b/backend/open_webui/utils/middleware.py @@ -2165,7 +2165,27 @@ async def load_messages_from_db(chat_id: str, message_id: str) -> Optional[list[ return [{k: v for k, v in msg.items() if k in ('role', 'content', 'output', 'files')} for msg in db_messages] -def process_messages_with_output(messages: list[dict]) -> list[dict]: +def get_reasoning_format(model: dict) -> str | None: + """ + Determine how reasoning should be included in reconstructed messages. + + Returns: + 'think_tags': Ollama expects reasoning wrapped in the item's start/end tags inside content. + 'reasoning_content': llama.cpp supports reasoning_content as a top-level field. + None: skip reasoning (safe default for strict providers). + """ + provider = model.get('provider', '') + if provider == 'ollama': + return 'think_tags' + if provider == 'llama.cpp': + return 'reasoning_content' + return None + + +def process_messages_with_output( + messages: list[dict], + reasoning_format: str | None = None, +) -> list[dict]: """ Process messages with OR-aligned output items for LLM consumption. 
@@ -2177,7 +2197,9 @@ def process_messages_with_output(messages: list[dict]) -> list[dict]: for message in messages: if message.get('role') == 'assistant' and message.get('output'): # Use output items for clean OpenAI-format messages - output_messages = convert_output_to_messages(message['output'], raw=True) + output_messages = convert_output_to_messages( + message['output'], raw=True, reasoning_format=reasoning_format, + ) if output_messages: processed.extend(output_messages) continue @@ -2315,7 +2337,10 @@ async def process_chat_payload(request, form_data, user, metadata, model): form_data['messages'].append({'role': 'user', 'content': regeneration_prompt}) # Process messages with OR-aligned output items for clean LLM messages - form_data['messages'] = process_messages_with_output(form_data.get('messages', [])) + form_data['messages'] = process_messages_with_output( + form_data.get('messages', []), + reasoning_format=get_reasoning_format(model), + ) system_message = get_system_message(form_data.get('messages', [])) if system_message: # Chat Controls/User Settings @@ -4741,10 +4766,10 @@ async def streaming_chat_response_handler(response, ctx): system_message = get_system_message(form_data['messages']) new_form_data['messages'] = ( [system_message] if system_message else [] - ) + convert_output_to_messages(output, raw=True) + ) + convert_output_to_messages(output, raw=True, reasoning_format=get_reasoning_format(model)) new_form_data['previous_response_id'] = last_response_id else: - tool_messages = convert_output_to_messages(output, raw=True) + tool_messages = convert_output_to_messages(output, raw=True, reasoning_format=get_reasoning_format(model)) # Chat Completions providers don't support multimodal # tool messages. Extract images into a user message. 
@@ -4964,7 +4989,7 @@ async def streaming_chat_response_handler(response, ctx): 'metadata': metadata, 'messages': [ *form_data['messages'], - *convert_output_to_messages(output, raw=True), + *convert_output_to_messages(output, raw=True, reasoning_format=get_reasoning_format(model)), ], } diff --git a/backend/open_webui/utils/misc.py b/backend/open_webui/utils/misc.py index dec5dce94c..b6df292890 100644 --- a/backend/open_webui/utils/misc.py +++ b/backend/open_webui/utils/misc.py @@ -129,7 +129,11 @@ def get_content_from_message(message: dict) -> Optional[str]: return None -def convert_output_to_messages(output: list, raw: bool = False) -> list[dict]: +def convert_output_to_messages( + output: list, + raw: bool = False, + reasoning_format: str | None = None, +) -> list[dict]: """ Convert OR-aligned output items to OpenAI Chat Completion-format messages. @@ -139,8 +143,14 @@ def convert_output_to_messages(output: list, raw: bool = False) -> list[dict]: Args: output: List of OR-aligned output items (Responses API format). - raw: If True, include reasoning blocks (with original tags) and code - interpreter blocks for LLM re-processing follow-ups. + raw: If True, include code interpreter blocks for LLM re-processing + follow-ups. + reasoning_format: How to include reasoning blocks in the output: + - None: skip reasoning (default, safe for strict providers). + - ``'think_tags'``: wrap in the item's ``start_tag``/``end_tag`` inside content + (for Ollama, which expects reasoning as tagged content). + - ``'reasoning_content'``: set as ``reasoning_content`` top-level field + (for llama.cpp, which routes it via the chat template). 
""" if not output or not isinstance(output, list): return [] @@ -148,19 +158,26 @@ def convert_output_to_messages(output: list, raw: bool = False) -> list[dict]: messages = [] pending_tool_calls = [] pending_content = [] + pending_reasoning = [] # Only populated when reasoning_format == 'reasoning_content' def flush_pending(): - nonlocal pending_content, pending_tool_calls - if pending_content or pending_tool_calls: - messages.append( - { - 'role': 'assistant', - 'content': '\n'.join(pending_content) if pending_content else '', - **({'tool_calls': pending_tool_calls} if pending_tool_calls else {}), - } - ) - pending_content = [] - pending_tool_calls = [] + nonlocal pending_content, pending_tool_calls, pending_reasoning + if not pending_content and not pending_tool_calls and not pending_reasoning: + return + + message = { + 'role': 'assistant', + 'content': '\n'.join(pending_content) if pending_content else '', + **({'tool_calls': pending_tool_calls} if pending_tool_calls else {}), + } + + if pending_reasoning: + message['reasoning_content'] = '\n'.join(pending_reasoning) + + messages.append(message) + pending_content = [] + pending_tool_calls = [] + pending_reasoning = [] for item in output: item_type = item.get('type', '') @@ -231,27 +248,26 @@ def convert_output_to_messages(output: list, raw: bool = False) -> list[dict]: ) elif item_type == 'reasoning': - if raw: - # Include reasoning with original tags for LLM re-processing - reasoning_text = '' - source_list = item.get('summary', []) or item.get('content', []) - for part in source_list: - if part.get('type') == 'output_text': - reasoning_text += part.get('text', '') - elif 'text' in part: - reasoning_text += part.get('text', '') + if not reasoning_format: + continue - if reasoning_text: + reasoning_text = '' + source_list = item.get('summary', []) or item.get('content', []) + for part in source_list: + if part.get('type') == 'output_text': + reasoning_text += part.get('text', '') + elif 'text' in part: + 
reasoning_text += part.get('text', '') + + if reasoning_text: + if reasoning_format == 'think_tags': + # Ollama: embed in content with the item's original tags start_tag = item.get('start_tag', '') end_tag = item.get('end_tag', '') pending_content.append(f'{start_tag}{reasoning_text}{end_tag}') - # NOTE: Some providers (e.g. Moonshot/Kimi K2.5) require - # reasoning_content as a top-level field on assistant - # messages. This should be handled externally via a - # pipeline filter or connection-level middleware, not - # here — adding it universally breaks strict providers - # (OpenAI, Vertex AI, Azure) that reject unknown fields. - # else: skip reasoning blocks for normal LLM messages + elif reasoning_format == 'reasoning_content': + # llama.cpp: collect for reasoning_content field + pending_reasoning.append(reasoning_text) elif item_type == 'open_webui:code_interpreter': # Always include code interpreter content so the LLM knows