import gradio as gr
from openai import OpenAI

# OpenAI-compatible client pointed at the SocietyAI vLLM endpoint.
# NOTE(review): no api_key is passed, so the SDK falls back to the
# OPENAI_API_KEY environment variable — confirm the hub accepts it.
client = OpenAI(
    base_url='https://hub.societyai.com/models/llama-3-2-3b/openai/v1',
)

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def chat(user_message, history: list):
        """
        Stream a chat completion into the Gradio chatbot.

        1) Appends the user message to the conversation history
        2) Sends the conversation history to the vLLM API
        3) Streams back the assistant's response

        Yields:
            ("", history) pairs — the empty string clears the textbox,
            and `history` grows token-by-token as chunks arrive.
        """
        # 1) Append the user message. `history + [...]` copies the list so
        #    Gradio's component state is not mutated in place mid-stream.
        history = history + [{"role": "user", "content": user_message}]

        # 2) Prepare an empty assistant entry that streaming will fill in.
        history.append({"role": "assistant", "content": ""})

        # 3) Stream the assistant's response.
        try:
            completion = client.chat.completions.create(
                model="llama-3.2-3B-instruct",
                messages=history,
                stream=True,
            )
            for chunk in completion:
                delta = chunk.choices[0].delta
                # delta.content can be None on role/stop chunks even when the
                # attribute exists, so `or ''` normalizes it alongside getattr.
                content = getattr(delta, 'content', '') or ''
                if content:
                    history[-1]['content'] += content
                # Yield both the cleared textbox ("") and updated history.
                yield "", history
        except Exception as e:
            # Surface the failure inside the chat instead of crashing the UI;
            # this is a top-level UI boundary, so the broad catch is deliberate.
            history[-1]['content'] += f"\n[Error]: {str(e)}"
            yield "", history

    # Wire up the single chat function to the UI.
    msg.submit(
        fn=chat,                 # streaming generator defined above
        inputs=[msg, chatbot],   # pass user message and current chatbot history
        outputs=[msg, chatbot],  # clear the message box and update the chatbot
    )

    # Reset the conversation. An empty list (not None) is the correct empty
    # value for a type="messages" Chatbot history.
    clear.click(lambda: [], None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch()