Multimodal AI Assistant with Memory and Speech

- Multimodal Input Support: Accepts image, video, text, and audio inputs, enabling versatile applications such as visual question answering, speech recognition, and more.
- Real-Time Speech Interaction: Supports bilingual real-time speech conversation with configurable voices, including emotion, speed, and style control, as well as end-to-end voice cloning and role play.
- Enhanced Visual Capabilities
import streamlit as st
from PIL import Image
import numpy as np
from datetime import datetime
# Configure the page
st.set_page_config(
    page_title="Multimodal AI Assistant",
    page_icon="🤖",
    layout="wide",
    initial_sidebar_state="expanded"
)
# Initialize session state
if 'conversation_history' not in st.session_state:
    st.session_state.conversation_history = []
if 'current_conversation' not in st.session_state:
    st.session_state.current_conversation = []
if 'voice_settings' not in st.session_state:
    st.session_state.voice_settings = {
        'voice': 'alloy',
        'speed': 1.0,
        'language': 'en'
    }
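# Note: st.session_state persists across script reruns within a single browser
# session, but its contents are lost when the page is refreshed or the session
# ends.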
def add_to_conversation(role, content, content_type="text", metadata=None):
    """Add a message to the current conversation"""
    message = {
        'role': role,
        'content': content,
        'content_type': content_type,
        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        'metadata': metadata or {}
    }
    st.session_state.current_conversation.append(message)
def simulate_ai_response(prompt, content_type="text"):
    """Simulate AI response based on input type"""
    if content_type == "image":
        return f"I can see an image. Based on what I observe, {prompt}. This appears to be a visual input that I can analyze for objects, text, scenes, and other visual elements."
    elif content_type == "audio":
        return f"I received an audio input. I can help transcribe speech, analyze audio content, or respond to voice commands. Your message: {prompt}"
    elif content_type == "video":
        return f"I can process video content. This appears to be a video file that I can analyze for motion, objects, scenes, and temporal changes. {prompt}"
    else:
        responses = [
            f"Thank you for your message: '{prompt}'. I'm here to help with any questions or tasks you have.",
            f"I understand you're asking about: {prompt}. Let me provide you with helpful information.",
            f"Based on your input '{prompt}', I can assist you with analysis, suggestions, or detailed explanations.",
        ]
        return np.random.choice(responses)
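# simulate_ai_response is a stand-in so the app runs without any backend. To
# wire in a real model, replace its body with a provider call. A rough,
# hypothetical sketch (client, method, and model names below are placeholders,
# not a real SDK):
#
#     result = client.generate(model="some-multimodal-model",
#                              inputs={"text": prompt, "type": content_type})
#     return result.text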
def display_conversation():
    """Display the conversation history"""
    for message in st.session_state.current_conversation:
        with st.chat_message(message['role']):
            if message['content_type'] == 'image':
                st.write(f"**Image uploaded at {message['timestamp']}**")
                if 'image_data' in message['metadata']:
                    st.image(message['metadata']['image_data'], caption="Uploaded Image", use_container_width=True)
                st.write(message['content'])
            elif message['content_type'] == 'audio':
                st.write(f"**Audio uploaded at {message['timestamp']}**")
                if 'audio_data' in message['metadata']:
                    st.audio(message['metadata']['audio_data'])
                st.write(message['content'])
            elif message['content_type'] == 'video':
                st.write(f"**Video uploaded at {message['timestamp']}**")
                st.write(message['content'])
            else:
                st.write(message['content'])
# Sidebar for settings and memory
with st.sidebar:
    st.header("🎛️ Assistant Settings")

    # Voice settings
    st.subheader("Voice Configuration")
    voice_options = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
    st.session_state.voice_settings['voice'] = st.selectbox(
        "Voice Type",
        voice_options,
        index=voice_options.index(st.session_state.voice_settings['voice'])
    )
    st.session_state.voice_settings['speed'] = st.slider(
        "Speech Speed",
        min_value=0.5,
        max_value=2.0,
        value=st.session_state.voice_settings['speed'],
        step=0.1
    )
    language_options = ['en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh']
    st.session_state.voice_settings['language'] = st.selectbox(
        "Language",
        language_options,
        index=language_options.index(st.session_state.voice_settings['language'])
    )

    st.divider()

    # Memory management
    st.subheader("💾 Conversation Memory")
    st.write(f"Current conversation: {len(st.session_state.current_conversation)} messages")
    st.write(f"Total conversations: {len(st.session_state.conversation_history)}")

    if st.button("Save Current Conversation"):
        if st.session_state.current_conversation:
            st.session_state.conversation_history.append({
                'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'messages': st.session_state.current_conversation.copy()
            })
            st.success("Conversation saved to memory!")

    if st.button("Clear Current Conversation"):
        st.session_state.current_conversation = []
        st.rerun()

    if st.button("Clear All Memory"):
        st.session_state.conversation_history = []
        st.session_state.current_conversation = []
        st.rerun()
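# The voice settings collected above are stored in session state but are not
# consumed anywhere else in this demo; a real text-to-speech backend would
# read them when synthesizing spoken replies.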
# Main interface
st.title("🤖 Multimodal AI Assistant")
st.subheader("Chat with AI using Text, Images, Audio, and Video")
# Create tabs for different input types
tab1, tab2, tab3, tab4, tab5 = st.tabs(["💬 Chat", "🖼️ Images", "🎵 Audio", "🎥 Video", "📊 Memory"])
with tab1:
    st.header("Text Conversation")

    # Display conversation
    if st.session_state.current_conversation:
        display_conversation()

    # Chat input
    if prompt := st.chat_input("Type your message here..."):
        # Add user message
        add_to_conversation("user", prompt)

        # Display user message
        with st.chat_message("user"):
            st.write(prompt)

        # Generate and display AI response
        with st.chat_message("assistant"):
            response = simulate_ai_response(prompt)
            st.write(response)
        add_to_conversation("assistant", response)
        st.rerun()
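# The st.rerun() above restarts the script from the top, so
# display_conversation() re-renders the full history, including the two
# messages just appended.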
with tab2:
    st.header("Visual Analysis")
    uploaded_images = st.file_uploader(
        "Upload images for analysis",
        type=['png', 'jpg', 'jpeg', 'gif', 'bmp'],
        accept_multiple_files=True
    )
    if uploaded_images:
        for idx, uploaded_file in enumerate(uploaded_images):
            col1, col2 = st.columns([1, 2])
            with col1:
                image = Image.open(uploaded_file)
                st.image(image, caption=f"Image {idx+1}: {uploaded_file.name}", use_container_width=True)
            with col2:
                st.write(f"**Filename:** {uploaded_file.name}")
                st.write(f"**Size:** {image.size}")
                st.write(f"**Format:** {image.format}")
                analysis_prompt = st.text_input(
                    "What would you like to know about this image?",
                    key=f"image_prompt_{idx}",
                    placeholder="Describe what you see, identify objects, read text, etc."
                )
                if st.button(f"Analyze Image {idx+1}", key=f"analyze_{idx}"):
                    if analysis_prompt:
                        # Add to conversation
                        add_to_conversation(
                            "user",
                            f"Image analysis request: {analysis_prompt}",
                            "image",
                            {"image_data": image, "filename": uploaded_file.name}
                        )
                        response = simulate_ai_response(analysis_prompt, "image")
                        add_to_conversation("assistant", response, "text")
                        st.success("Image analysis added to conversation!")
                        st.write("**AI Response:**")
                        st.write(response)
with tab3:
    st.header("Audio Processing")

    # Audio recorder
    st.subheader("🎙️ Record Audio")
    # st.audio_input is the stable name for what older Streamlit releases
    # exposed as st.experimental_audio_input
    recorded_audio = st.audio_input("Record your voice message")
    if recorded_audio:
        st.audio(recorded_audio)
        audio_prompt = st.text_input(
            "Describe what you want to do with this audio:",
            placeholder="Transcribe, analyze, respond to voice command, etc."
        )
        if st.button("Process Audio") and audio_prompt:
            add_to_conversation(
                "user",
                f"Audio processing request: {audio_prompt}",
                "audio",
                {"audio_data": recorded_audio}
            )
            response = simulate_ai_response(audio_prompt, "audio")
            add_to_conversation("assistant", response)
            st.success("Audio processing added to conversation!")
            st.write("**AI Response:**")
            st.write(response)

    # File upload for audio
    st.subheader("📁 Upload Audio Files")
    uploaded_audio = st.file_uploader(
        "Upload audio files",
        type=['mp3', 'wav', 'ogg', 'm4a'],
        accept_multiple_files=True
    )
    if uploaded_audio:
        for audio_file in uploaded_audio:
            st.audio(audio_file)
            st.write(f"**Filename:** {audio_file.name}")
            if st.button(f"Process {audio_file.name}", key=f"process_audio_{audio_file.name}"):
                add_to_conversation(
                    "user",
                    f"Uploaded audio file: {audio_file.name}",
                    "audio",
                    {"audio_data": audio_file}
                )
                response = simulate_ai_response(f"Audio file: {audio_file.name}", "audio")
                add_to_conversation("assistant", response)
                st.success("Audio file processed!")
with tab4:
    st.header("Video Analysis")
    uploaded_videos = st.file_uploader(
        "Upload video files for analysis",
        type=['mp4', 'avi', 'mov', 'wmv', 'flv'],
        accept_multiple_files=True
    )
    if uploaded_videos:
        for video_file in uploaded_videos:
            st.video(video_file)
            st.write(f"**Filename:** {video_file.name}")
            video_prompt = st.text_input(
                f"What would you like to analyze in {video_file.name}?",
                key=f"video_prompt_{video_file.name}",
                placeholder="Describe scenes, detect objects, analyze motion, etc."
            )
            if st.button(f"Analyze {video_file.name}", key=f"analyze_video_{video_file.name}") and video_prompt:
                add_to_conversation(
                    "user",
                    f"Video analysis request: {video_prompt}",
                    "video",
                    {"filename": video_file.name}
                )
                response = simulate_ai_response(video_prompt, "video")
                add_to_conversation("assistant", response)
                st.success("Video analysis added to conversation!")
                st.write("**AI Response:**")
                st.write(response)
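# As with audio, no video frames are extracted or analyzed; only the filename
# is stored in metadata and the response is simulated.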
with tab5:
    st.header("Conversation Memory")

    # Current conversation
    if st.session_state.current_conversation:
        st.subheader("Current Conversation")
        st.write(f"Messages: {len(st.session_state.current_conversation)}")
        with st.expander("View Current Conversation"):
            for message in st.session_state.current_conversation:
                st.write(f"**{message['role'].title()}** ({message['timestamp']}):")
                st.write(f"Type: {message['content_type']}")
                st.write(message['content'])
                st.divider()

    # Conversation history
    if st.session_state.conversation_history:
        st.subheader("Saved Conversations")
        for i, conversation in enumerate(st.session_state.conversation_history):
            with st.expander(f"Conversation {i+1} - {conversation['timestamp']} ({len(conversation['messages'])} messages)"):
                for message in conversation['messages']:
                    st.write(f"**{message['role'].title()}:** {message['content'][:100]}...")
                if st.button(f"Load Conversation {i+1}", key=f"load_{i}"):
                    st.session_state.current_conversation = conversation['messages'].copy()
                    st.success("Conversation loaded!")
                    st.rerun()
    # Export/Import functionality
    st.subheader("Export/Import")
    export_data = {
        'current_conversation': st.session_state.current_conversation,
        'conversation_history': st.session_state.conversation_history,
        'voice_settings': st.session_state.voice_settings
    }
    # Offer the download directly: nesting st.download_button inside a regular
    # st.button would make it disappear on the rerun triggered by the first click.
    st.download_button(
        label="Download Conversation Data",
        data=str(export_data),
        file_name=f"ai_assistant_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
        mime="text/plain"
    )
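    # str(export_data) yields a Python repr rather than JSON. For a
    # machine-readable export, one option is json.dumps(export_data, default=str),
    # assuming binary payloads in metadata (PIL images, uploaded files) are
    # acceptable as their string representations or stripped out first.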
# Footer
st.divider()
st.caption("🤖 Multimodal AI Assistant - Supporting text, images, audio, and video interactions with persistent memory")