"""Gradio chat UI that streams replies from an OpenAI-compatible endpoint."""

import gradio as gr
from openai import OpenAI

# Model identifier sent with every completion request.
# Adjust the model name if needed.
MODEL_NAME = "llama-3.2-3B-instruct"

# Initialize the OpenAI client
client = OpenAI(
    api_key="EMPTY",
    base_url='https://llama-3-2-3b.societyai.com/openai/v1',
)


def _to_api_messages(history: list) -> list:
    """Return a copy of *history* reduced to the ``role``/``content`` keys.

    The chat widget may attach extra, UI-only keys to each message
    (NOTE(review): e.g. ``metadata``/``options`` in recent Gradio releases --
    confirm against the installed version). Only the two keys the chat
    completions API needs are forwarded.
    """
    return [
        {"role": message["role"], "content": message["content"]}
        for message in history
    ]


def user(user_message, history: list):
    """Append the user message to the conversation history.

    Returns:
        A pair ``("", new_history)``: the empty string clears the textbox and
        ``new_history`` is a new list with the user's message appended.
    """
    return "", history + [{"role": "user", "content": user_message}]


def bot(history: list):
    """Stream the assistant's reply for the conversation in *history*.

    The conversation is sent to the API, then an empty assistant message is
    appended to *history* and filled in as streamed chunks arrive.

    Yields:
        The updated history after every chunk that carries text, and once
        more if an error occurs (with the error text appended to the reply).
    """
    # Snapshot the request BEFORE adding the placeholder; otherwise the model
    # is sent an empty assistant turn as the final message.
    request_messages = _to_api_messages(history)

    # Placeholder that is filled in as the response streams back.
    history.append({"role": "assistant", "content": ""})

    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=request_messages,
            stream=True,
        )
        for chunk in completion:
            # Some chunks carry no choices; skip them instead of letting
            # `choices[0]` raise IndexError.
            if not chunk.choices:
                continue
            content = getattr(chunk.choices[0].delta, "content", None)
            if content:
                history[-1]["content"] += content
                yield history
    except Exception as e:
        # UI boundary: show the failure in the chat instead of crashing the
        # event handler.
        history[-1]["content"] += f"\n[Error]: {e}"
        yield history


with gr.Blocks(css="footer {visibility: hidden}") as demo:
    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    # Set up the Gradio interface components: record the user's message
    # first, then stream the assistant's reply into the chatbot.
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.launch()