Prevent summarization from polluting message stream

Hi! I was wondering what the suggested approach is when a node makes multiple model calls but only the messages from one of those models should be yielded to the stream. Following the langmem tutorial, I have a summarization call before the actual model call. Since I am using streamMode: "messages", every model result gets streamed to my client, including the one produced inside summarize_messages. As a result, the intermediate message from the summarization call is added to the message stream and the React useStream client renders it, even though I want to keep it hidden because it is not part of the actual message list.

I tried moving the summarization into a separate node (roughly as sketched after the server code below), but that does not solve the problem either, because with the React SDK I cannot filter by node or tags. I am not sure how to deal with this without writing my own client implementation, so any advice or a pointer to relevant documentation would be highly appreciated.


langgraph server:

from datetime import datetime
from typing import Annotated

from langchain.chat_models import init_chat_model
from langchain_core.messages import (
    BaseMessage,
    SystemMessage,
)
from langgraph.graph import START, StateGraph, add_messages
from langgraph.prebuilt import ToolNode, tools_condition
from langmem.short_term import RunningSummary, summarize_messages
from typing_extensions import TypedDict

from graphs.config import GraphConfig
from graphs.prompts import (
    SYSTEM_PROMPT,
)
from utils.utils import get_model_name


class State(TypedDict):
    messages: Annotated[list[BaseMessage], add_messages]
    summary: RunningSummary | None


tools = [...]


def call_model(state: State, config: GraphConfig) -> State:
    system_prompt = config.get("configurable", {}).get(
        "system_prompt",
        SYSTEM_PROMPT.format(current_date=datetime.now().strftime("%Y-%m-%d")),
    )
    system_prompt_message = SystemMessage(content=system_prompt)
    messages = state["messages"]

    summary_model = init_chat_model(
        model="google_genai:gemini-2.5-flash",
        temperature=config.get("configurable", {}).get("temperature", 0.6),
        disable_streaming=True,
        tags=["summarizer"],
    )

    # summarize the messages
    summarization_result = summarize_messages(
        messages,
        running_summary=state.get("summary"),
        token_counter=summary_model.get_num_tokens_from_messages,
        model=summary_model,
        max_tokens=12000,
        max_tokens_before_summary=8000,
        max_summary_tokens=2000,
    )

    # initialize the actual model call
    model_kwargs = {
        "model": get_model_name(config),
        "temperature": config.get("configurable", {}).get("temperature", 0.6),
    }
    if "max_output_tokens" in config.get("configurable", {}):
        model_kwargs["max_tokens"] = config.get("configurable", {})["max_output_tokens"]

    model = init_chat_model(**model_kwargs)

    model_with_tools = model.bind_tools(tools)
    response = model_with_tools.invoke(summarization_result.messages)

    state_update = {"messages": [response]}
    if summarization_result.running_summary:
        state_update["summary"] = summarization_result.running_summary

    return state_update

tool_node = ToolNode(tools=tools)

# Define the graph
graph = (
    StateGraph(state_schema=State, config_schema=GraphConfig)
    .add_node("model", call_model)
    .add_node("tools", tool_node)
    .add_edge(START, "model")
    .add_edge("tools", "model")
    .add_conditional_edges("model", tools_condition)
    .compile(name="Graph")
)
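
For reference, the separate-node variant I mentioned above looked roughly like this (a simplified sketch, not my exact code; in that variant call_model would read state["summary"] instead of summarizing inline):

def summarize_node(state: State, config: GraphConfig) -> State:
    # dedicated, non-streaming model used only for summarization
    summary_model = init_chat_model(
        model="google_genai:gemini-2.5-flash",
        disable_streaming=True,
        tags=["summarizer"],
    )
    result = summarize_messages(
        state["messages"],
        running_summary=state.get("summary"),
        token_counter=summary_model.get_num_tokens_from_messages,
        model=summary_model,
        max_tokens=12000,
        max_tokens_before_summary=8000,
        max_summary_tokens=2000,
    )
    # only update the state; nothing here is meant to reach the client
    update = {}
    if result.running_summary:
        update["summary"] = result.running_summary
    return update

# wired in ahead of the model node instead of START -> "model":
# .add_node("summarize", summarize_node)
# .add_edge(START, "summarize")
# .add_edge("summarize", "model")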

Dependencies:

langsmith>=0.4.4
langgraph>=0.5.1
langgraph-cli>=0.3.3
langgraph-cli[inmem]>=0.3.3
langchain[openai]>=0.3.25
langchain[google-genai]>=0.3.25
langchain[anthropic]>=0.3.25
pytest>=8.4.0
requests>=2.32.4
PyJWT[crypto]>=2.8.0
langchain-groq>=0.3.6
langmem>=0.0.28

client side (heavily simplified):

const StreamContext = createContext<StreamContextType | undefined>(undefined);

const StreamSession = ({
  children,
  apiUrl,
  assistantId,
}: {
  children: ReactNode;
  apiUrl: string;
  assistantId: string;
}) => {
  const streamValue = useStream({
    apiUrl,
    assistantId,
    reconnectOnMount: true,
    threadId,
    initialValues,
    messagesKey: "messages",
    defaultHeaders: {
      Authorization: `Bearer ${token}`,
    },
  });

  return (
    <StreamContext.Provider value={streamValue}>
      {children}
    </StreamContext.Provider>
  );
};

export const StreamProvider: React.FC<{ children: ReactNode }> = ({
  children,
}) => {
  return (
    <StreamSession apiUrl={apiUrl} assistantId={assistantId}>
      {children}
    </StreamSession>
  );
};

// Create a custom hook to use the context
export const useStreamContext = (): StreamContextType => {
  const context = useContext(StreamContext);
  if (context === undefined) {
    throw new Error("useStreamContext must be used within a StreamProvider");
  }
  return context;
};

export default StreamContext;

export const Thread = function Thread({
  onClose,
  setMaxWidth = false,
}: ThreadProps) {
  const stream = useStreamContext();
  const messages = stream.messages;
  const isLoading = stream.isLoading;

  const handleSubmit = async (
    e: FormEvent,
    deviceId: string,
    orgId: string,
    userId: string
  ) => {
    e.preventDefault();

    // get input from form event
    const messageText = e.target.value;

    const newMessage: Message = {
      id: uuidv4(),
      type: "human",
      content: [{ type: "text", text: messageText }],
    };

    stream.submit(
      { messages: [...stream.messages, newMessage] },
      {
        metadata: {
          organization_id: orgId,
          device_id: deviceId,
          user_id: userId,
        },
        config: {
          configurable: {},
        },
        streamMode: ["messages"],
        optimisticValues: (prev) => ({
          ...prev,
          messages: [...(prev.messages ?? []), newMessage],
        }),
      }
    );
  };

  return (
    <StickToBottom>
      <StickyToBottomContent
        content={messages.map((message, index) =>
          message.type === "human" ? (
            <HumanMessage
              key={message.id || `${message.type}-${index}`}
              message={message}
              isLoading={isLoading}
              handleRegenerate={handleRegenerate}
            />
          ) : (
            <AssistantMessage
              key={message.id || `${message.type}-${index}`}
              message={message}
              isLoading={isLoading}
              handleRegenerate={handleRegenerate}
            />
          )
        )}
        footer={<ChatInput input={input} onSubmit={handleSubmit} />}
      />
    </StickToBottom>
  );
};

function ChatUI({
  onClose,
  setMaxWidth,
}: {
  onClose?: () => void;
  setMaxWidth?: boolean;
}) {
  return (
    <StreamProvider>
      <Thread onClose={onClose} setMaxWidth={setMaxWidth} />
    </StreamProvider>
  );
}

Dependencies:

"@langchain/core": "^0.3.66",
"@langchain/langgraph": "^0.3.11",
"@langchain/langgraph-sdk": "^0.0.100",

I have the same issue. Every LLM call gets streamed to the user and then disappears. I have been looking for a way to suppress the intermediate output, but nothing has worked so far.

Similar challenge: I cannot figure out how to filter the LLM tokens emitted from within a tool call so I can redirect them in my React app.

Hi,

You can do one trick: use a separate model for the summary, and when initialising it pass disable_streaming=True:

summary_model = init_chat_model(
    model="gpt-4xx",
    disable_streaming=True,
)

As mentioned here
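
For what it's worth, here is a minimal sketch of what the flag changes (the model name and prompt are just examples, and it assumes credentials for that provider are configured): with disable_streaming=True the model stops emitting token-by-token output, and .stream() falls back to yielding the complete response as a single chunk.

from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage

summary_model = init_chat_model(
    model="google_genai:gemini-2.5-flash",
    disable_streaming=True,  # no token-by-token streaming for this model
)

# stream() still works, but yields the full response as one chunk
chunks = list(summary_model.stream([HumanMessage(content="Summarize this conversation.")]))
print(len(chunks))  # typically 1: the whole response arrives at once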