Hi everyone, greetings from a student who is currently exploring LLM chatbots. I need some help from the LangChain community for insights or guidance, as I've run out of ideas and can't figure out what is happening to my chatbot's generated responses. I will share my code here:
import os
import re
import time
import streamlit as st
from io import BytesIO
import pdfplumber
from docx import Document as docxDoc
from pptx import Presentation
import openpyxl
import pandas as pd
import csv
from langchain_community.llms.gpt4all import GPT4All
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.documents import Document
# |=================HELPER FUNCTION===================|
def bot_reply(bot_chain, user_question, rag=None):
    input_dict = {
        "content": rag or "",
        "question": user_question
    }
    print(f"🚩Debug: {input_dict}")
    accumulate = ''
    response = bot_chain.invoke(input_dict)
    print(f"🚩Debug - Received Response: {response}")
    # split the full response into sentences and replay them to simulate streaming
    sentences = re.split(r"(?<=[.!?]) +", response)
    for sentence in sentences:
        accumulate += sentence + " "
        yield accumulate
        time.sleep(0.05)
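# Side note: the "streaming" above is simulated, the chain is invoked once and
# the full answer is replayed sentence by sentence. A quick way to test this
# generator on its own, outside Streamlit, is something like the throwaway
# snippet below (the question string is just a made-up example):
#
#     for partial in bot_reply(chain, "What is FAISS?"):
#         print(partial)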
def make_chunks(corpus):
    docs = []
    for data in corpus:
        docs.append(Document(
            page_content=data['content'],
            metadata={
                "page": data['page']
            }
        ))
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50
    )
    chunks = text_splitter.split_documents(docs)
    return chunks
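# Quick sanity check for the chunker (throwaway sketch, the fake page dict
# below is made up and not from my real data):
#
#     sample = [{"page": 1, "content": "lorem ipsum " * 200}]
#     for c in make_chunks(sample)[:2]:
#         print(c.metadata, len(c.page_content))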
def pdf_extractor(uploaded_file):
    file_bytes = uploaded_file.read()
    pdf_stream = BytesIO(file_bytes)
    with pdfplumber.open(pdf_stream) as attachment:
        data = []
        for page in attachment.pages:
            pg_num = page.page_number
            content = page.extract_text() or ""
            if content:
                data.append({
                    "page": pg_num,
                    "content": content
                })
    return data
def docx_extractor(uploaded_file):
    doc = docxDoc(uploaded_file)
    data = []
    incre = 0
    for para in doc.paragraphs:
        incre += 1
        if para.text.strip():
            data.append({
                # .docx has no fixed pages, so this is really a paragraph index
                "page": incre,
                "content": para.text.strip()
            })
    return data
def pptx_extractor(uploaded_file):
    ppt = Presentation(uploaded_file)
    data = []
    incre = 0
    for slide in ppt.slides:
        slide_text = []
        incre += 1
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                slide_text.append(shape.text.strip())
        raw = " ".join(slide_text)
        clean = re.sub(r'\s+', ' ', raw).strip()
        # check the cleaned text, since slide_text can be a list of empty strings
        if clean:
            data.append({
                "page": incre,
                "content": clean
            })
    return data
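# Note: I haven't written extractors for txt/xlsx yet, so those fall through
# to data = [] in the router further down. A minimal txt extractor would
# probably look like this (untested sketch):
def txt_extractor(uploaded_file):
    raw = uploaded_file.read().decode("utf-8", errors="ignore")
    # .txt has no pages, so treat the whole file as page 1
    return [{"page": 1, "content": raw}] if raw.strip() else []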
model = GPT4All(
    model="Llama-3.2-3B-Instruct-Q4_0.gguf",
    temp=0.75,
    top_p=1,
    verbose=False,
    max_tokens=2048,
    allow_download=False,
    device="cpu"
)
bot_template = SystemMessagePromptTemplate.from_template("""
You are the personal AI assistant.
Answer questions and keep your responses clear and precise.
If applicable, structure your response in point form.
""")
human_template = HumanMessagePromptTemplate.from_template("""
{content}
Question: {question}
""")
prompt = ChatPromptTemplate.from_messages([bot_template, human_template])
chain = prompt | model
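# Debug sketch: GPT4All here is a plain completion LLM, not a chat model, so
# the ChatPromptTemplate gets flattened into one "System: ... / Human: ..."
# string before it reaches the model. Printing the formatted prompt shows
# exactly what the model sees (placeholder values below are made up):
#
#     print(prompt.format(content="<retrieved chunks>", question="<my question>"))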
# |=================TITAN UI===================|
st.header("Hello :orange[Yong Ming]!")
st.subheader("Start chating with :orange[TITAN]!")
with st.sidebar:
    st.subheader("Continue to our previous chat")
# initialize chat history
if "messages" not in st.session_state:
    st.session_state.messages = []
# preserve history when coming back or on rerun
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])
# `user_input` avoids shadowing the ChatPromptTemplate `prompt` defined above
if user_input := st.chat_input(
    placeholder="Write Message Here...",
    accept_file=True,
    file_type=['pdf', 'txt', 'docx', 'pptx', 'xlsx'],
):
    text = user_input.text or ""
    files = user_input.files or []
    with st.chat_message("human"):
        if text:
            st.markdown(text)
        if files:
            st.markdown(f"🧷**My Attachment:** {files[0].name}")
    st.session_state.messages.append({
        "role": "human",
        "content": text
    })
    rag_context = None
    if files:
        target_dir = 'C:/Users/ASUS/OneDrive/Desktop/documented_chatbot/attachments/'
        os.makedirs(target_dir, exist_ok=True)
        uploaded_file = files[0]
        saved_attachment = os.path.join(target_dir, uploaded_file.name)
        with open(saved_attachment, 'wb') as f:
            f.write(uploaded_file.getbuffer())
        # route to decide extractor
        if uploaded_file.name.endswith(".pdf"):
            data = pdf_extractor(uploaded_file)
        elif uploaded_file.name.endswith(".docx"):
            data = docx_extractor(uploaded_file)
        elif uploaded_file.name.endswith(".pptx"):
            data = pptx_extractor(uploaded_file)
        else:
            data = []
        chunks = make_chunks(data)
        rag_context = "\n\n".join([c.page_content for c in chunks])
    with st.chat_message("ai"):
        placeholder = st.empty()
        ai_msg = ""
        for chunk in bot_reply(chain, text, rag_context):
            placeholder.markdown(chunk)
            ai_msg = chunk
    st.session_state.messages.append({
        "role": "ai",
        "content": ai_msg
    })
    print(f"🚩Debug: {ai_msg}")
Most of the time, the responses seem to be cut off, and they sometimes start with a weird opening, like in the picture shown below:
I hope to get a clearer explanation for what I'm seeing, as I assume something in my backend is causing it. Also, what should I take note of when dealing with cut-off responses?
I'd really appreciate anyone who can explain and provide help. Thank you so much!
