Multimodal AI Assistant with Memory and Speech

- Multimodal Input Support: Accepts image, video, text, and audio inputs, enabling versatile applications such as visual question answering, speech recognition, and more.
- Real-Time Speech Interaction: Supports bilingual real-time speech conversation with configurable voices, including emotion, speed, and style control, as well as end-to-end voice cloning and role play.
- Enhanced Visual Capabilities
import streamlit as st
from PIL import Image
import numpy as np
from datetime import datetime
# Configure the page
st.set_page_config(
    page_title="Multimodal AI Assistant",
    page_icon="🤖",
    layout="wide",
    initial_sidebar_state="expanded"
)
# Initialize session state
if 'conversation_history' not in st.session_state:
    st.session_state.conversation_history = []
if 'current_conversation' not in st.session_state:
    st.session_state.current_conversation = []
if 'voice_settings' not in st.session_state:
    st.session_state.voice_settings = {
        'voice': 'alloy',
        'speed': 1.0,
        'language': 'en'
    }
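# Note: st.session_state persists across script reruns within a single browser
# session, but its contents are lost when the page is refreshed or the session
# ends.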
def add_to_conversation(role, content, content_type="text", metadata=None):
    """Add a message to the current conversation"""
    message = {
        'role': role,
        'content': content,
        'content_type': content_type,
        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        'metadata': metadata or {}
    }
    st.session_state.current_conversation.append(message)
def simulate_ai_response(prompt, content_type="text"):
    """Simulate AI response based on input type"""
    if content_type == "image":
        return f"I can see an image. Based on what I observe, {prompt}. This appears to be a visual input that I can analyze for objects, text, scenes, and other visual elements."
    elif content_type == "audio":
        return f"I received an audio input. I can help transcribe speech, analyze audio content, or respond to voice commands. Your message: {prompt}"
    elif content_type == "video":
        return f"I can process video content. This appears to be a video file that I can analyze for motion, objects, scenes, and temporal changes. {prompt}"
    else:
        responses = [
            f"Thank you for your message: '{prompt}'. I'm here to help with any questions or tasks you have.",
            f"I understand you're asking about: {prompt}. Let me provide you with helpful information.",
            f"Based on your input '{prompt}', I can assist you with analysis, suggestions, or detailed explanations.",
        ]
        return np.random.choice(responses)
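# simulate_ai_response is a stand-in so the app runs without any backend. To
# wire in a real model, replace its body with a provider call. A rough,
# hypothetical sketch (client, method, and model names below are placeholders,
# not a real SDK):
#
#     result = client.generate(model="some-multimodal-model",
#                              inputs={"text": prompt, "type": content_type})
#     return result.text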
def display_conversation():
    """Display the conversation history"""
    for message in st.session_state.current_conversation:
        with st.chat_message(message['role']):
            if message['content_type'] == 'image':
                st.write(f"**Image uploaded at {message['timestamp']}**")
                if 'image_data' in message['metadata']:
                    st.image(message['metadata']['image_data'], caption="Uploaded Image", use_container_width=True)
                st.write(message['content'])
            elif message['content_type'] == 'audio':
                st.write(f"**Audio uploaded at {message['timestamp']}**")
                if 'audio_data' in message['metadata']:
                    st.audio(message['metadata']['audio_data'])
                st.write(message['content'])
            elif message['content_type'] == 'video':
                st.write(f"**Video uploaded at {message['timestamp']}**")
                st.write(message['content'])
            else:
                st.write(message['content'])
# Sidebar for settings and memory
with st.sidebar:
    st.header("🎛️ Assistant Settings")

    # Voice settings
    st.subheader("Voice Configuration")
    voice_options = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']
    st.session_state.voice_settings['voice'] = st.selectbox(
        "Voice Type",
        voice_options,
        index=voice_options.index(st.session_state.voice_settings['voice'])
    )
    st.session_state.voice_settings['speed'] = st.slider(
        "Speech Speed",
        min_value=0.5,
        max_value=2.0,
        value=st.session_state.voice_settings['speed'],
        step=0.1
    )
    language_options = ['en', 'es', 'fr', 'de', 'it', 'pt', 'ru', 'ja', 'ko', 'zh']
    st.session_state.voice_settings['language'] = st.selectbox(
        "Language",
        language_options,
        index=language_options.index(st.session_state.voice_settings['language'])
    )

    st.divider()

    # Memory management
    st.subheader("💾 Conversation Memory")
    st.write(f"Current conversation: {len(st.session_state.current_conversation)} messages")
    st.write(f"Total conversations: {len(st.session_state.conversation_history)}")

    if st.button("Save Current Conversation"):
        if st.session_state.current_conversation:
            st.session_state.conversation_history.append({
                'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'messages': st.session_state.current_conversation.copy()
            })
            st.success("Conversation saved to memory!")

    if st.button("Clear Current Conversation"):
        st.session_state.current_conversation = []
        st.rerun()

    if st.button("Clear All Memory"):
        st.session_state.conversation_history = []
        st.session_state.current_conversation = []
        st.rerun()
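# The voice settings collected above are stored in session state but are not
# consumed anywhere else in this demo; a real text-to-speech backend would
# read them when synthesizing spoken replies.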
# Main interface
st.title("🤖 Multimodal AI Assistant")
st.subheader("Chat with AI using Text, Images, Audio, and Video")
# Create tabs for different input types
tab1, tab2, tab3, tab4, tab5 = st.tabs(["💬 Chat", "🖼️ Images", "🎵 Audio", "🎥 Video", "📊 Memory"])
with tab1:
    st.header("Text Conversation")

    # Display conversation
    if st.session_state.current_conversation:
        display_conversation()

    # Chat input
    if prompt := st.chat_input("Type your message here..."):
        # Add user message
        add_to_conversation("user", prompt)

        # Display user message
        with st.chat_message("user"):
            st.write(prompt)

        # Generate and display AI response
        with st.chat_message("assistant"):
            response = simulate_ai_response(prompt)
            st.write(response)
        add_to_conversation("assistant", response)
        st.rerun()
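# The st.rerun() above restarts the script from the top, so
# display_conversation() re-renders the full history, including the two
# messages just appended.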
with tab2:
    st.header("Visual Analysis")
    uploaded_images = st.file_uploader(
        "Upload images for analysis",
        type=['png', 'jpg', 'jpeg', 'gif', 'bmp'],
        accept_multiple_files=True
    )
    if uploaded_images:
        for idx, uploaded_file in enumerate(uploaded_images):
            col1, col2 = st.columns([1, 2])
            with col1:
                image = Image.open(uploaded_file)
                st.image(image, caption=f"Image {idx+1}: {uploaded_file.name}", use_container_width=True)
            with col2:
                st.write(f"**Filename:** {uploaded_file.name}")
                st.write(f"**Size:** {image.size}")
                st.write(f"**Format:** {image.format}")
                analysis_prompt = st.text_input(
                    "What would you like to know about this image?",
                    key=f"image_prompt_{idx}",
                    placeholder="Describe what you see, identify objects, read text, etc."
                )
                if st.button(f"Analyze Image {idx+1}", key=f"analyze_{idx}"):
                    if analysis_prompt:
                        # Add to conversation
                        add_to_conversation(
                            "user",
                            f"Image analysis request: {analysis_prompt}",
                            "image",
                            {"image_data": image, "filename": uploaded_file.name}
                        )
                        response = simulate_ai_response(analysis_prompt, "image")
                        add_to_conversation("assistant", response, "text")
                        st.success("Image analysis added to conversation!")
                        st.write("**AI Response:**")
                        st.write(response)
with tab3:
    st.header("Audio Processing")

    # Audio recorder
    st.subheader("🎙️ Record Audio")
    # st.audio_input is the stable name for what older Streamlit releases
    # exposed as st.experimental_audio_input
    recorded_audio = st.audio_input("Record your voice message")
    if recorded_audio:
        st.audio(recorded_audio)
        audio_prompt = st.text_input(
            "Describe what you want to do with this audio:",
            placeholder="Transcribe, analyze, respond to voice command, etc."
        )
        if st.button("Process Audio") and audio_prompt:
            add_to_conversation(
                "user",
                f"Audio processing request: {audio_prompt}",
                "audio",
                {"audio_data": recorded_audio}
            )
            response = simulate_ai_response(audio_prompt, "audio")
            add_to_conversation("assistant", response)
            st.success("Audio processing added to conversation!")
            st.write("**AI Response:**")
            st.write(response)

    # File upload for audio
    st.subheader("📁 Upload Audio Files")
    uploaded_audio = st.file_uploader(
        "Upload audio files",
        type=['mp3', 'wav', 'ogg', 'm4a'],
        accept_multiple_files=True
    )
    if uploaded_audio:
        for audio_file in uploaded_audio:
            st.audio(audio_file)
            st.write(f"**Filename:** {audio_file.name}")
            if st.button(f"Process {audio_file.name}", key=f"process_audio_{audio_file.name}"):
                add_to_conversation(
                    "user",
                    f"Uploaded audio file: {audio_file.name}",
                    "audio",
                    {"audio_data": audio_file}
                )
                response = simulate_ai_response(f"Audio file: {audio_file.name}", "audio")
                add_to_conversation("assistant", response)
                st.success("Audio file processed!")
with tab4:
    st.header("Video Analysis")
    uploaded_videos = st.file_uploader(
        "Upload video files for analysis",
        type=['mp4', 'avi', 'mov', 'wmv', 'flv'],
        accept_multiple_files=True
    )
    if uploaded_videos:
        for video_file in uploaded_videos:
            st.video(video_file)
            st.write(f"**Filename:** {video_file.name}")
            video_prompt = st.text_input(
                f"What would you like to analyze in {video_file.name}?",
                key=f"video_prompt_{video_file.name}",
                placeholder="Describe scenes, detect objects, analyze motion, etc."
            )
            if st.button(f"Analyze {video_file.name}", key=f"analyze_video_{video_file.name}") and video_prompt:
                add_to_conversation(
                    "user",
                    f"Video analysis request: {video_prompt}",
                    "video",
                    {"filename": video_file.name}
                )
                response = simulate_ai_response(video_prompt, "video")
                add_to_conversation("assistant", response)
                st.success("Video analysis added to conversation!")
                st.write("**AI Response:**")
                st.write(response)
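# As with audio, no video frames are extracted or analyzed; only the filename
# is stored in metadata and the response is simulated.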
with tab5:
    st.header("Conversation Memory")

    # Current conversation
    if st.session_state.current_conversation:
        st.subheader("Current Conversation")
        st.write(f"Messages: {len(st.session_state.current_conversation)}")
        with st.expander("View Current Conversation"):
            for message in st.session_state.current_conversation:
                st.write(f"**{message['role'].title()}** ({message['timestamp']}):")
                st.write(f"Type: {message['content_type']}")
                st.write(message['content'])
                st.divider()

    # Conversation history
    if st.session_state.conversation_history:
        st.subheader("Saved Conversations")
        for i, conversation in enumerate(st.session_state.conversation_history):
            with st.expander(f"Conversation {i+1} - {conversation['timestamp']} ({len(conversation['messages'])} messages)"):
                for message in conversation['messages']:
                    st.write(f"**{message['role'].title()}:** {message['content'][:100]}...")
                if st.button(f"Load Conversation {i+1}", key=f"load_{i}"):
                    st.session_state.current_conversation = conversation['messages'].copy()
                    st.success("Conversation loaded!")
                    st.rerun()
    # Export/Import functionality
    st.subheader("Export/Import")
    export_data = {
        'current_conversation': st.session_state.current_conversation,
        'conversation_history': st.session_state.conversation_history,
        'voice_settings': st.session_state.voice_settings
    }
    # Offer the download directly: nesting st.download_button inside a regular
    # st.button would make it disappear on the rerun triggered by the first click.
    st.download_button(
        label="Download Conversation Data",
        data=str(export_data),
        file_name=f"ai_assistant_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt",
        mime="text/plain"
    )
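    # str(export_data) yields a Python repr rather than JSON. For a
    # machine-readable export, one option is json.dumps(export_data, default=str),
    # assuming binary payloads in metadata (PIL images, uploaded files) are
    # acceptable as their string representations or stripped out first.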
# Footer
st.divider()
st.caption("🤖 Multimodal AI Assistant - Supporting text, images, audio, and video interactions with persistent memory")