Prevent summarization from polluting message stream

Hi! I was wondering what the suggested approach is for having multiple model calls in a node when I only want to yield messages from one of those models to the stream. In my setup I make a summarization call before the actual model call, following the langmem tutorial. Since I am using streamMode: "messages", every model result gets streamed to my client, including the one inside summarize_messages. As a result, the intermediate message produced by the summarization model call gets added to the message stream, and the React useStream client shows it. However, I want to keep that message hidden since it is not part of the actual message list. I tried moving the summarization into a separate node, but that doesn't solve my problem either, because with the React SDK I can't filter by node or tags. I'm not sure how to deal with this without writing my own implementation of the client, so any advice or a pointer to relevant documentation would be highly appreciated.


langgraph server:

from datetime import datetime
from typing import Annotated

from langchain.chat_models import init_chat_model
from langchain_core.messages import (
    BaseMessage,
    SystemMessage,
)
from langgraph.graph import START, StateGraph, add_messages
from langgraph.prebuilt import ToolNode, tools_condition
from langmem.short_term import RunningSummary, summarize_messages
from typing_extensions import TypedDict

from graphs.config import GraphConfig
from graphs.prompts import (
    SYSTEM_PROMPT,
)
from utils.utils import get_model_name


class State(TypedDict):
    messages: Annotated[list[BaseMessage], add_messages]
    summary: RunningSummary | None


tools = [...]


def call_model(state: State, config: GraphConfig) -> State:
    system_prompt = config.get("configurable", {}).get(
        "system_prompt",
        SYSTEM_PROMPT.format(current_date=datetime.now().strftime("%Y-%m-%d")),
    )
    system_prompt_message = SystemMessage(content=system_prompt)
    messages = state["messages"]

    summary_model = init_chat_model(
        model="google_genai:gemini-2.5-flash",
        temperature=config.get("configurable", {}).get("temperature", 0.6),
        disable_streaming=True,
        tags=["summarizer"],
    )

    # summarize the messages
    summarization_result = summarize_messages(
        messages,
        running_summary=state.get("summary"),
        token_counter=summary_model.get_num_tokens_from_messages,
        model=summary_model,
        max_tokens=12000,
        max_tokens_before_summary=8000,
        max_summary_tokens=2000,
    )

    # initialize the actual model call
    model_kwargs = {
        "model": get_model_name(config),
        "temperature": config.get("configurable", {}).get("temperature", 0.6),
    }
    if "max_output_tokens" in config.get("configurable", {}):
        model_kwargs["max_tokens"] = config.get("configurable", {})["max_output_tokens"]

    model = init_chat_model(**model_kwargs)

    model_with_tools = model.bind_tools(tools)
    response = model_with_tools.invoke(summarization_result.messages)

    state_update = {"messages": [response]}
    if summarization_result.running_summary:
        state_update["summary"] = summarization_result.running_summary

    return state_update

tool_node = ToolNode(tools=tools)

# Define the graph
graph = (
    StateGraph(state_schema=State, config_schema=GraphConfig)
    .add_node("model", call_model)
    .add_node("tools", tool_node)
    .add_edge(START, "model")
    .add_edge("tools", "model")
    .add_conditional_edges("model", tools_condition)
    .compile(name="Graph")
)

Dependencies:

langsmith>=0.4.4
langgraph>=0.5.1
langgraph-cli>=0.3.3
langgraph-cli[inmem]>=0.3.3
langchain[openai]>=0.3.25
langchain[google-genai]>=0.3.25
langchain[anthropic]>=0.3.25
pytest>=8.4.0
requests>=2.32.4
PyJWT[crypto]>=2.8.0
langchain-groq>=0.3.6
langmem>=0.0.28

client side (heavily simplified):

const StreamContext = createContext<StreamContextType | undefined>(undefined);

const StreamSession = 
  ({
    children,
    apiUrl,
    assistantId,
  }: {
    children: ReactNode;
    apiUrl: string;
    assistantId: string;
  }) => {

    const streamValue = useStream({
      apiUrl,
      assistantId,
      reconnectOnMount: true,
      threadId,
      initialValues,
      messagesKey: "messages",
      defaultHeaders: {
        Authorization: `Bearer ${token}`,
      },
    });

    return (
      <StreamContext.Provider value={streamValue}>
        {children}
      </StreamContext.Provider>
    );
  }

export const StreamProvider: React.FC<{ children: ReactNode }> = ({
  children,
}) => {
  return (
    <StreamSession apiUrl={apiUrl} assistantId={assistantId}>
      {children}
    </StreamSession>
  );
};

// Create a custom hook to use the context
export const useStreamContext = (): StreamContextType => {
  const context = useContext(StreamContext);
  if (context === undefined) {
    throw new Error("useStreamContext must be used within a StreamProvider");
  }
  return context;
};

export default StreamContext;

export const Thread = function Thread({
  onClose,
  setMaxWidth = false,
}: ThreadProps) {
  const stream = useStreamContext();
  const messages = stream.messages;
  const isLoading = stream.isLoading;

  const handleSubmit = async (
    e: FormEvent,
    deviceId: string,
    orgId: string,
    userId: string
  ) => {
    e.preventDefault();

    // get input from form event
    const messageText = e.target.value;

    const newMessage: Message = {
      id: uuidv4(),
      type: "human",
      content: [{ type: "text", text: messageText }],
    };

    stream.submit(
      { messages: [...stream.messages, newMessage] },
      {
        metadata: {
          organization_id: orgId,
          device_id: deviceId,
          user_id: userId,
        },
        config: {
          configurable: {},
        },
        streamMode: ["messages"],
        optimisticValues: (prev) => ({
          ...prev,
          messages: [...(prev.messages ?? []), newMessage],
        }),
      }
    );
  };

  return (
    <StickToBottom>
      <StickyToBottomContent
        content={messages.map((message, index) =>
          message.type === "human" ? (
            <HumanMessage
              key={message.id || `${message.type}-${index}`}
              message={message}
              isLoading={isLoading}
              handleRegenerate={handleRegenerate}
            />
          ) : (
            <AssistantMessage
              key={message.id || `${message.type}-${index}`}
              message={message}
              isLoading={isLoading}
              handleRegenerate={handleRegenerate}
            />
          )
        )}
        footer={<ChatInput input={input} onSubmit={handleSubmit} />}
      />
    </StickToBottom>
  );
};

function ChatUI({
  onClose,
  setMaxWidth,
}: {
  onClose?: () => void;
  setMaxWidth?: boolean;
}) {
  return (
    <StreamProvider>
      <Thread onClose={onClose} />
    </StreamProvider>
  );
}

Dependencies:

"@langchain/core": "^0.3.66",
"@langchain/langgraph": "^0.3.11",
"@langchain/langgraph-sdk": "^0.0.100",

I have the same issue. Every LLM call gets streamed to the user and then disappears. I have been trying to find a way to suppress the intermediate output, but nothing has worked so far.

Similar challenge here: I can't figure out how to filter the LLM tokens produced inside a tool call so I can redirect them in my React app.

Hi,

You can use one trick: initialise a separate model for the summary and pass disable_streaming=True when creating it:

summary_model = init_chat_model(
    model="gpt-4xx",
    disable_streaming=True,
)

As mentioned here

I tried using a separate model but still had the same issue.

edit:
OK, so after hours of debugging I finally solved it. I first tried what's in the docs, then this separate-model method with streaming for the summary turned off, but nothing worked. Finally I tried to resolve it by looking at what I was actually getting back in the stream, and this is what I found:

[1] === FULL TOKEN ===
[1] Token type: AIMessage
[1] Token ID: sometokenid
[1] Token kwargs: undefined
[1] Response metadata: {
[1]   tokenUsage: { promptTokens: 919, completionTokens: 23, totalTokens: 942 },
[1]   finish_reason: 'stop',
[1]   model_provider: 'openai',
[1]   model_name: 'gpt-3.5-turbo-0125'
[1] }
[1] Model provider: openai
[1] Content: The user asked for help in understanding Data Structures and Algorithms (DSA), which is crucial for coding and software development.
[1] ==================

[1] === FULL TOKEN ===
[1] Token type: RemoveMessage
[1] Token ID: remove_all
[1] Token kwargs: undefined
[1] Response metadata: {}
[1] Model provider: undefined
[1] Content: 

[1] ==================

[1] === FULL TOKEN ===
[1] Token type: HumanMessage
[1] Token ID: sometokenid
[1] Token kwargs: undefined
[1] Response metadata: {}
[1] Model provider: undefined
[1] Content: Here is a summary of the conversation to date:
[1]
[1] The user asked for help in understanding Data Structures and Algorithms (DSA), which is crucial for coding and software development.
[1] ==================

[1] === FULL TOKEN ===
[1] Token type: AIMessageChunk
[1] Token ID: sometokenid
[1] Token kwargs: undefined
[1] Response metadata: { model_provider: 'openai', usage: {} }
[1] Model provider: openai
[1] Content:
[1] ==================
[1]
[1] === FULL TOKEN ===
[1] Token type: AIMessageChunk
[1] Token ID: sometokenid
[1] Token kwargs: undefined
[1] Response metadata: { model_provider: 'openai', usage: {} }
[1] Model provider: openai
[1] Content: You’re
[1] ==================
[1]
[1] === FULL TOKEN ===
[1] Token type: AIMessageChunk
[1] Token ID: sometokenid
[1] Token kwargs: undefined
[1] Response metadata: { model_provider: 'openai', usage: {} }
[1] Model provider: openai
[1] Content:  very
[1] ==================

The proper response comes back with token type AIMessageChunk, so I skip the others.

Example:

for await (const [token] of stream) {
  // Skip summary-related messages
  const tokenType = token.constructor.name;

  if (tokenType === "RemoveMessage" || tokenType === "HumanMessage") {
    console.log("Skipping token type:", tokenType);
    continue; // Skip RemoveMessage and HumanMessage (summary) tokens
  }

  // Only process AIMessageChunk tokens
  if (tokenType !== "AIMessageChunk") {
    continue;
  }

  const blocks = token.contentBlocks;
  if (!blocks || !Array.isArray(blocks)) continue;

  for (let i = 0; i < blocks.length; i++) {
    const block = blocks[i];
    if (block.type === "text") {
      const text = (block as any).text;
      if (text) res.write(text);
    } else if (block.type === "tool_call_chunk") {
      const name = (block as any).name;
      if (name) res.write(`\n[TOOL_CALL: ${name}]\n`);
    }
  }
}

and it worked fine :sweat_smile:.
If I am doing something wrong, please guide me.
Thanks!
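
One possible refinement, assuming the stream above comes from graph.stream(...) with streamMode "messages" and the summarizer model was created with tags (e.g., tags: ["summarizer"]): instead of checking constructor names, you can filter on the metadata half of the yielded [message, metadata] tuple, which is the approach the LangGraph streaming guide describes for filtering by LLM invocation:

for await (const [token, metadata] of stream) {
  // Drop chunks emitted by the model tagged as the summarizer
  if (metadata?.tags?.includes("summarizer")) {
    continue;
  }
  // ...process AIMessageChunk tokens as before
}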


hi guys,

imho: streamMode "messages" streams tokens from every chat model invocation inside your graph, even when you call .invoke, not just your "final" model. That's why your summarizer's output appears in the UI. You have two practical options:

  • Tag-and-filter on the client (recommended for your setup; a minimal sketch follows this list)

    • Add tags to the LLM you want visible (e.g., tags=["assistant"]) and a different tag to the summarizer (e.g., tags=["summarizer"]).
    • In React, don't render stream.messages directly. Instead, build your own visible message list by listening to events and filtering by metadata.tags or metadata.langgraph_node. With the React SDK, use onLangChainEvent to receive on_chat_model_stream/on_chat_model_end events and ignore any whose tags include "summarizer". See the docs on filtering by tags and node:
      • Streaming guide - Filter by LLM invocation using tags
      • Streaming guide - Filter by node using metadata.langgraph_node
      • React useStream - it builds messages from messages-tuple under the hood; use onLangChainEvent to implement your filter
      • Cloud streaming - messages-tuple mode and filtering notes
  • Disable streaming for the summarizer model

    • You're already doing disable_streaming=True. This suppresses token streaming, but message events can still be emitted on model end in "messages" mode. Tagging plus client-side filtering is still the more robust way to keep those internal outputs out of your UI list.
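
To make the tag-and-filter option concrete, here is a minimal sketch of what the client-side filter could look like. It assumes your @langchain/langgraph-sdk version exposes the onLangChainEvent callback mentioned above, that the summarizer model keeps its tags=["summarizer"] (as in the original graph), and that the run is submitted with a stream mode that emits LangChain events; the exact event/chunk shape can differ between SDK versions, so treat the field accesses as a starting point rather than a drop-in implementation (useVisibleAssistantText is just an illustrative helper name):

import { useState } from "react";
import { useStream } from "@langchain/langgraph-sdk/react";

function useVisibleAssistantText({
  apiUrl,
  assistantId,
}: {
  apiUrl: string;
  assistantId: string;
}) {
  // Text accumulated only from the model we actually want to show
  const [visibleText, setVisibleText] = useState("");

  const stream = useStream({
    apiUrl,
    assistantId,
    messagesKey: "messages",
    // Fires for LangChain events such as on_chat_model_stream / on_chat_model_end.
    // Depending on the SDK version you may need to include "events" in the
    // streamMode array passed to stream.submit(...).
    onLangChainEvent: (event: any) => {
      if (event?.event !== "on_chat_model_stream") return;
      // Drop anything produced by the summarizer model
      if (event?.tags?.includes("summarizer")) return;
      // The chunk may arrive as a plain object or a serialized message,
      // depending on the SDK version -- adjust the access path if needed.
      const chunk = event?.data?.chunk;
      const text =
        typeof chunk?.content === "string" ? chunk.content : chunk?.kwargs?.content;
      if (typeof text === "string") {
        setVisibleText((prev) => prev + text);
      }
    },
  });

  return { stream, visibleText };
}

In the Thread component you could then render visibleText (or a message list you assemble the same way) for the in-progress assistant reply, instead of relying solely on stream.messages.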

@pawel-twardziak I'm aware it's possible to filter messages on the frontend, and I can also use .with_config({"tags": [TAG_NOSTREAM]}) to prevent a chat model from streaming to the frontend. But nonetheless, if my langgraph nodes output a BaseMessage anywhere in the state (seemingly even if it's not in the "public" messages key), that message will show up in the "messages" stream mode. This seems to be the case even if it is in a part of the state that is not included in the graph's Output Schema. Is there a way to stop this automatic emitting of BaseMessage instances when they're put into the state by a node in my graph? (I'm on langgraph 0.4.10 and upgrading isn't really feasible at the moment, but please do let me know if this behaviour has changed in newer versions.)