Panel newbie here. I have written the following Panel application that uses an LLM to answer questions over a vector database:
import os, dotenv, openai, panel
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import Chroma
from langchain.memory import ConversationBufferMemory
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
panel.extension()
# Set API key
dotenv.load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
openai.api_key = OPENAI_API_KEY
@panel.cache
def load_vectorstore():
    # If the vector embeddings of the documents have not been created yet
    if not os.path.isfile('chroma_db/chroma.sqlite3'):
        # Load the documents
        loader = DirectoryLoader('Docs/', glob="./*.pdf", loader_cls=PyPDFLoader)
        data = loader.load()
        # Split the docs into chunks
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=50
        )
        docs = splitter.split_documents(data)
        # Embed the documents and store them in a Chroma DB
        embedding = OpenAIEmbeddings(openai_api_key=openai.api_key)
        vectorstore = Chroma.from_documents(documents=docs, embedding=embedding, persist_directory="./chroma_db")
    else:
        # Load the existing Chroma DB from disk
        embedding = OpenAIEmbeddings(openai_api_key=openai.api_key)
        vectorstore = Chroma(persist_directory="./chroma_db", embedding_function=embedding)
    return vectorstore
def retrieval_qa_chain():
    # Define the prompt template
    template = """
    Provide your answers to the best of your ability to the user's questions.
    ## Task Context and History
    - **Context**: {context}
    - **Chat History**: {history}
    - **User Question**: {question}
    ## Answer Template
    Keep your explanations concise and to the point.
    """
    prompt = PromptTemplate(
        input_variables=["history", "context", "question"],
        template=template,
    )
    # Conversation memory so the prompt receives the chat history
    memory = ConversationBufferMemory(
        memory_key="history",
        input_key="question"
    )
    llm = ChatOpenAI(
        temperature=0,
        model="gpt-4-1106-preview",
        openai_api_key=openai.api_key,
        streaming=True
    )
    vectorstore = load_vectorstore()
    # Stuff the retrieved chunks into the prompt
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        chain_type='stuff',
        retriever=vectorstore.as_retriever(),
        chain_type_kwargs={
            "prompt": prompt,
            "memory": memory
        }
    )
    return qa_chain
async def respond(contents, user, chat_interface):
    # Build the chain and run it on the user's message
    qa = retrieval_qa_chain()
    response = qa({"query": contents})
    answers = panel.Column(response["result"])
    yield {"user": "Bot", "value": answers}
chat_interface = panel.chat.ChatInterface(
    callback=respond, sizing_mode="stretch_width", callback_exception='verbose'
)
chat_interface.send(
    {"user": "Bot", "value": '''Ask me any question.'''},
    respond=False,
)
template = panel.template.BootstrapTemplate(main=[chat_interface])
template.servable()
It works, but the LLM's response is displayed all at once. I want the response to be streamed instead. How do I do that?
I tried using panel.chat.langchain.PanelCallbackHandler, but that introduces other artefacts into the chat response, such as the source documents, and it also changes the name of the chatbot. I don't want any of that - I just want my LLM responses to be streamed instead of displayed all at once at the end. Is there a simple way to do that in Panel?
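For reference, this is roughly what my attempt looked like (simplified - I threaded the handler through to ChatOpenAI via an extra callbacks argument on retrieval_qa_chain; the rest of the chain setup is unchanged from the code above):

def retrieval_qa_chain(callbacks=None):
    # prompt, memory and vectorstore are set up exactly as in the code above
    llm = ChatOpenAI(
        temperature=0,
        model="gpt-4-1106-preview",
        openai_api_key=openai.api_key,
        streaming=True,
        callbacks=callbacks or [],
    )
    # ... same RetrievalQA.from_chain_type(...) call as above, using this llm ...

async def respond(contents, user, chat_interface):
    # Hand the ChatInterface to Panel's LangChain callback handler so the
    # LLM's tokens are streamed into the chat as they arrive
    handler = panel.chat.langchain.PanelCallbackHandler(chat_interface)
    qa = retrieval_qa_chain(callbacks=[handler])
    # The handler writes the streamed answer into the chat itself (along with
    # the extra artefacts I mentioned), so nothing is yielded here
    qa({"query": contents})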