Multimodal AI Assistant with Memory and Speech

Multimodal Input Support: accepts image, video, text, and audio inputs, allowing for versatile applications such as visual question answering, speech recognition, and more.

Real-Time Speech Interaction: supports bilingual real-time speech conversations with configurable voices, including emotion, speed, and style control, as well as end-to-end voice cloning and role play.
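# Suggested dependencies, inferred from the imports below (PyPI package names;
# exact versions are an assumption, not specified by the original code):
#   streamlit, openai<1.0 (the legacy openai.ChatCompletion interface is used),
#   Pillow, opencv-python, numpy, pandas, gTTS, pygame, SpeechRecognition, requests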
import streamlit as st
import openai
import base64
import io
import requests
from PIL import Image
import cv2
import numpy as np
import pandas as pd
from gtts import gTTS
import pygame
import speech_recognition as sr
import tempfile
import os
from datetime import datetime
# Page config
st.set_page_config(
    page_title="Multimodal AI Assistant",
    page_icon="🤖",
    layout="wide"
)
# Initialize session state
if "messages" not in st.session_state:
st.session_state.messages = []
if "voice_enabled" not in st.session_state:
st.session_state.voice_enabled = False
if "memory" not in st.session_state:
st.session_state.memory = {}
# Sidebar for configuration
with st.sidebar:
st.title("⚙️ Configuration")
# API Key input
api_key = st.text_input("OpenAI API Key", type="password", help="Enter your OpenAI API key")
if api_key:
openai.api_key = api_key
st.divider()
# Voice settings
st.subheader("🎙️ Voice Settings")
voice_enabled = st.checkbox("Enable Voice Interaction", value=st.session_state.voice_enabled)
st.session_state.voice_enabled = voice_enabled
if voice_enabled:
voice_language = st.selectbox("Voice Language", ["en", "es", "fr", "de", "it", "pt"], index=0)
voice_speed = st.slider("Speech Speed", 0.5, 2.0, 1.0, 0.1)
voice_emotion = st.selectbox("Voice Style", ["neutral", "friendly", "professional", "enthusiastic"])
st.divider()
# Memory settings
st.subheader("🧠 Memory")
if st.button("Clear Memory"):
st.session_state.memory = {}
st.session_state.messages = []
st.success("Memory cleared!")
# Display memory info
if st.session_state.memory:
st.write("**Stored Information:**")
for key, value in st.session_state.memory.items():
st.write(f"- {key}: {value}")
# Main interface
st.title("🤖 Multimodal AI Assistant")
st.write("Upload images, videos, audio, or chat with text. I can see, hear, and remember!")
# Input tabs
tab1, tab2, tab3, tab4 = st.tabs(["💬 Text Chat", "🖼️ Image/Video", "🎵 Audio", "📊 Analysis"])
with tab1:
    st.subheader("Text Conversation")

    # Display chat history
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            if message["type"] == "text":
                st.markdown(message["content"])
            elif message["type"] == "image":
                st.image(message["content"], caption="Uploaded Image")
            elif message["type"] == "audio":
                st.audio(message["content"])

    # Text input
    if prompt := st.chat_input("Ask me anything..."):
        # Add user message
        st.session_state.messages.append({
            "role": "user",
            "content": prompt,
            "type": "text",
            "timestamp": datetime.now()
        })
        with st.chat_message("user"):
            st.markdown(prompt)

        # Generate response
        with st.chat_message("assistant"):
            if api_key:
                try:
                    # Create context with memory
                    context = f"Previous context: {st.session_state.memory}\n\nUser: {prompt}"
                    response = openai.ChatCompletion.create(
                        model="gpt-3.5-turbo",
                        messages=[
                            {"role": "system", "content": "You are a helpful multimodal AI assistant with memory. Remember important information from conversations and refer to it when relevant."},
                            {"role": "user", "content": context}
                        ]
                    )
                    assistant_response = response.choices[0].message.content
                    st.markdown(assistant_response)

                    # Store response
                    st.session_state.messages.append({
                        "role": "assistant",
                        "content": assistant_response,
                        "type": "text",
                        "timestamp": datetime.now()
                    })

                    # Extract and store memory
                    if "remember" in prompt.lower() or "my name is" in prompt.lower():
                        # Simple memory extraction
                        if "my name is" in prompt.lower():
                            name = prompt.lower().split("my name is")[1].strip()
                            st.session_state.memory["user_name"] = name

                    # Text-to-speech if enabled
                    if st.session_state.voice_enabled:
                        try:
                            tts = gTTS(text=assistant_response, lang=voice_language, slow=False)
                            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                                tts.save(tmp_file.name)
                                st.audio(tmp_file.name)
                        except Exception as e:
                            st.error(f"Voice synthesis error: {e}")
                except Exception as e:
                    st.error(f"Error: {e}")
            else:
                st.warning("Please enter your OpenAI API key in the sidebar.")
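    # Hedged sketch (not part of the original code): the keyword-based memory capture
    # above could instead ask the model itself to extract facts worth remembering
    # (this would sit inside the try-block where the response is handled). The prompt
    # wording below is an assumption for illustration only.
    # extraction = openai.ChatCompletion.create(
    #     model="gpt-3.5-turbo",
    #     messages=[
    #         {"role": "system", "content": "Extract any personal facts the user wants "
    #                                       "remembered as short 'key: value' lines, "
    #                                       "or reply NONE if there are none."},
    #         {"role": "user", "content": prompt},
    #     ],
    # )
    # for line in extraction.choices[0].message.content.splitlines():
    #     if ":" in line:
    #         key, value = line.split(":", 1)
    #         st.session_state.memory[key.strip()] = value.strip()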
with tab2:
    st.subheader("Visual Input")

    # Image upload
    uploaded_image = st.file_uploader("Upload an image", type=['png', 'jpg', 'jpeg', 'gif'])
    if uploaded_image:
        image = Image.open(uploaded_image)
        st.image(image, caption="Uploaded Image", use_column_width=True)

        # Image analysis prompt
        image_prompt = st.text_input("Ask about this image:", placeholder="What do you see in this image?")
        if st.button("Analyze Image") and image_prompt:
            if api_key:
                try:
                    # Convert image to base64
                    buffered = io.BytesIO()
                    image.save(buffered, format="PNG")
                    img_str = base64.b64encode(buffered.getvalue()).decode()

                    # Simulate vision analysis (OpenAI Vision API would be used in real implementation)
                    response_text = f"I can see an image you've uploaded. {image_prompt} (Note: Full vision analysis requires OpenAI Vision API)"

                    st.session_state.messages.append({
                        "role": "user",
                        "content": image,
                        "type": "image",
                        "prompt": image_prompt,
                        "timestamp": datetime.now()
                    })
                    st.session_state.messages.append({
                        "role": "assistant",
                        "content": response_text,
                        "type": "text",
                        "timestamp": datetime.now()
                    })
                    st.success(response_text)
                except Exception as e:
                    st.error(f"Error analyzing image: {e}")
            else:
                st.warning("Please enter your OpenAI API key.")
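        # Hedged sketch (not part of the original code): how the simulated analysis above
        # could call a real vision-capable chat model. It reuses the legacy
        # openai.ChatCompletion interface used elsewhere in this file; the model name
        # "gpt-4o" is an assumption, not something the original specifies.
        # vision_response = openai.ChatCompletion.create(
        #     model="gpt-4o",
        #     messages=[{
        #         "role": "user",
        #         "content": [
        #             {"type": "text", "text": image_prompt},
        #             {"type": "image_url",
        #              "image_url": {"url": f"data:image/png;base64,{img_str}"}},
        #         ],
        #     }],
        # )
        # response_text = vision_response.choices[0].message.content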
    # Video upload
    st.divider()
    uploaded_video = st.file_uploader("Upload a video", type=['mp4', 'avi', 'mov'])
    if uploaded_video:
        st.video(uploaded_video)
        if st.button("Analyze Video"):
            st.info("Video analysis would extract frames and analyze them sequentially.")
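            # Hedged sketch (not part of the original code): one way the frame extraction
            # described above could work with the cv2 import at the top of the file.
            # Sampling every 30th frame is an illustrative choice, not a requirement.
            # with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_vid:
            #     tmp_vid.write(uploaded_video.read())
            # cap = cv2.VideoCapture(tmp_vid.name)
            # frames, frame_idx = [], 0
            # while cap.isOpened():
            #     ok, frame = cap.read()
            #     if not ok:
            #         break
            #     if frame_idx % 30 == 0:
            #         frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            #     frame_idx += 1
            # cap.release()
            # st.write(f"Extracted {len(frames)} sample frames for analysis.")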
with tab3:
    st.subheader("Audio Input")

    # Audio recording
    if st.session_state.voice_enabled:
        st.write("🎤 Voice Recording")

        # Audio input widget
        audio_value = st.experimental_audio_input("Record a voice message")
        if audio_value:
            st.audio(audio_value)
            if st.button("Process Audio"):
                try:
                    # Speech recognition simulation
                    st.info("Processing speech... (In real implementation, this would use speech recognition)")

                    # Simulate transcription
                    transcribed_text = "This is a simulated transcription of your audio message."
                    st.session_state.messages.append({
                        "role": "user",
                        "content": audio_value,
                        "type": "audio",
                        "transcription": transcribed_text,
                        "timestamp": datetime.now()
                    })
                    st.success(f"Transcribed: {transcribed_text}")
                except Exception as e:
                    st.error(f"Error processing audio: {e}")
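                # Hedged sketch (not part of the original code): how the simulated
                # transcription above could use the speech_recognition import (sr)
                # from the top of the file. recognize_google calls Google's free web
                # speech API; that choice is an assumption for illustration only.
                # recognizer = sr.Recognizer()
                # with sr.AudioFile(audio_value) as source:
                #     audio_data = recognizer.record(source)
                # transcribed_text = recognizer.recognize_google(audio_data)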
    # File upload for audio
    uploaded_audio = st.file_uploader("Upload an audio file", type=['wav', 'mp3', 'ogg', 'm4a'])
    if uploaded_audio:
        st.audio(uploaded_audio)
        if st.button("Transcribe Audio"):
            st.info("Audio transcription would be processed here using speech recognition.")
with tab4:
    st.subheader("Conversation Analysis")

    if st.session_state.messages:
        # Message statistics
        total_messages = len(st.session_state.messages)
        user_messages = len([m for m in st.session_state.messages if m["role"] == "user"])
        assistant_messages = len([m for m in st.session_state.messages if m["role"] == "assistant"])

        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Messages", total_messages)
        with col2:
            st.metric("User Messages", user_messages)
        with col3:
            st.metric("Assistant Messages", assistant_messages)

        # Message types
        message_types = {}
        for message in st.session_state.messages:
            msg_type = message.get("type", "text")
            message_types[msg_type] = message_types.get(msg_type, 0) + 1

        if message_types:
            st.subheader("Message Types")
            df = pd.DataFrame(list(message_types.items()), columns=["Type", "Count"])
            st.bar_chart(df.set_index("Type"))

        # Recent messages
        st.subheader("Recent Messages")
        for message in st.session_state.messages[-5:]:
            with st.expander(f"{message['role'].title()} - {message.get('timestamp', 'Unknown time')}"):
                if message["type"] == "text":
                    st.write(message["content"])
                elif message["type"] == "image":
                    st.image(message["content"])
                    if "prompt" in message:
                        st.write(f"Prompt: {message['prompt']}")
                elif message["type"] == "audio":
                    st.audio(message["content"])
                    if "transcription" in message:
                        st.write(f"Transcription: {message['transcription']}")
    else:
        st.info("No conversation data available yet. Start chatting to see analysis!")
# Footer
st.divider()
st.markdown("""
### Features:
- 💬 **Text Chat**: Natural language conversation with memory
- 🖼️ **Image Analysis**: Upload and analyze images (Vision API integration ready)
- 🎥 **Video Processing**: Video upload and frame analysis
- 🎵 **Audio Processing**: Speech-to-text and text-to-speech
- 🧠 **Memory**: Persistent conversation memory
- 📊 **Analytics**: Conversation insights and statistics
- 🌍 **Multilingual**: Support for multiple languages
- 🎭 **Voice Styles**: Configurable voice emotions and speed

*Note: Some features require an OpenAI API key and additional API integrations for full functionality.*
""")
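# To try the app locally, save this script (e.g. as app.py; the filename is an
# assumption) and launch it with the Streamlit CLI: streamlit run app.py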