Multimodal AI Assistant with Memory and Speech

Multimodal Input Support: accepts image, video, text, and audio inputs, allowing for versatile applications such as visual question answering, speech recognition, and more.

Real-Time Speech Interaction: supports bilingual real-time speech conversations with configurable voices, including emotion, speed, and style control, as well as end-to-end voice cloning and role play.
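# Suggested dependencies, inferred from the imports below (PyPI package names;
# exact versions are an assumption, not specified by the original code):
#   streamlit, openai<1.0 (the legacy openai.ChatCompletion interface is used),
#   Pillow, opencv-python, numpy, pandas, gTTS, pygame, SpeechRecognition, requests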
import streamlit as st
import openai
import base64
import io
import requests
from PIL import Image
import cv2
import numpy as np
import pandas as pd
from gtts import gTTS
import pygame
import speech_recognition as sr
import tempfile
import os
from datetime import datetime
# Page config
st.set_page_config(
    page_title="Multimodal AI Assistant",
    page_icon="🤖",
    layout="wide"
)
# Initialize session state
if "messages" not in st.session_state:
st.session_state.messages = []
if "voice_enabled" not in st.session_state:
st.session_state.voice_enabled = False
if "memory" not in st.session_state:
st.session_state.memory = {}
# Sidebar for configuration
with st.sidebar:
st.title("⚙️ Configuration")
# API Key input
api_key = st.text_input("OpenAI API Key", type="password", help="Enter your OpenAI API key")
if api_key:
openai.api_key = api_key
st.divider()
# Voice settings
st.subheader("🎙️ Voice Settings")
voice_enabled = st.checkbox("Enable Voice Interaction", value=st.session_state.voice_enabled)
st.session_state.voice_enabled = voice_enabled
if voice_enabled:
voice_language = st.selectbox("Voice Language", ["en", "es", "fr", "de", "it", "pt"], index=0)
voice_speed = st.slider("Speech Speed", 0.5, 2.0, 1.0, 0.1)
voice_emotion = st.selectbox("Voice Style", ["neutral", "friendly", "professional", "enthusiastic"])
st.divider()
# Memory settings
st.subheader("🧠 Memory")
if st.button("Clear Memory"):
st.session_state.memory = {}
st.session_state.messages = []
st.success("Memory cleared!")
# Display memory info
if st.session_state.memory:
st.write("**Stored Information:**")
for key, value in st.session_state.memory.items():
st.write(f"- {key}: {value}")
# Main interface
st.title("🤖 Multimodal AI Assistant")
st.write("Upload images, videos, audio, or chat with text. I can see, hear, and remember!")
# Input tabs
tab1, tab2, tab3, tab4 = st.tabs(["💬 Text Chat", "🖼️ Image/Video", "🎵 Audio", "📊 Analysis"])
with tab1:
    st.subheader("Text Conversation")

    # Display chat history
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            if message["type"] == "text":
                st.markdown(message["content"])
            elif message["type"] == "image":
                st.image(message["content"], caption="Uploaded Image")
            elif message["type"] == "audio":
                st.audio(message["content"])

    # Text input
    if prompt := st.chat_input("Ask me anything..."):
        # Add user message
        st.session_state.messages.append({
            "role": "user",
            "content": prompt,
            "type": "text",
            "timestamp": datetime.now()
        })
        with st.chat_message("user"):
            st.markdown(prompt)

        # Generate response
        with st.chat_message("assistant"):
            if api_key:
                try:
                    # Create context with memory
                    context = f"Previous context: {st.session_state.memory}\n\nUser: {prompt}"
                    response = openai.ChatCompletion.create(
                        model="gpt-3.5-turbo",
                        messages=[
                            {"role": "system", "content": "You are a helpful multimodal AI assistant with memory. Remember important information from conversations and refer to it when relevant."},
                            {"role": "user", "content": context}
                        ]
                    )
                    assistant_response = response.choices[0].message.content
                    st.markdown(assistant_response)

                    # Store response
                    st.session_state.messages.append({
                        "role": "assistant",
                        "content": assistant_response,
                        "type": "text",
                        "timestamp": datetime.now()
                    })

                    # Extract and store memory
                    if "remember" in prompt.lower() or "my name is" in prompt.lower():
                        # Simple memory extraction
                        if "my name is" in prompt.lower():
                            name = prompt.lower().split("my name is")[1].strip()
                            st.session_state.memory["user_name"] = name

                    # Text-to-speech if enabled
                    if st.session_state.voice_enabled:
                        try:
                            tts = gTTS(text=assistant_response, lang=voice_language, slow=False)
                            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                                tts.save(tmp_file.name)
                                st.audio(tmp_file.name)
                        except Exception as e:
                            st.error(f"Voice synthesis error: {e}")
                except Exception as e:
                    st.error(f"Error: {e}")
            else:
                st.warning("Please enter your OpenAI API key in the sidebar.")
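    # Hedged sketch (not part of the original code): the keyword-based memory capture
    # above could instead ask the model itself to extract facts worth remembering
    # (this would sit inside the try-block where the response is handled). The prompt
    # wording below is an assumption for illustration only.
    # extraction = openai.ChatCompletion.create(
    #     model="gpt-3.5-turbo",
    #     messages=[
    #         {"role": "system", "content": "Extract any personal facts the user wants "
    #                                       "remembered as short 'key: value' lines, "
    #                                       "or reply NONE if there are none."},
    #         {"role": "user", "content": prompt},
    #     ],
    # )
    # for line in extraction.choices[0].message.content.splitlines():
    #     if ":" in line:
    #         key, value = line.split(":", 1)
    #         st.session_state.memory[key.strip()] = value.strip()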
with tab2:
    st.subheader("Visual Input")

    # Image upload
    uploaded_image = st.file_uploader("Upload an image", type=['png', 'jpg', 'jpeg', 'gif'])
    if uploaded_image:
        image = Image.open(uploaded_image)
        st.image(image, caption="Uploaded Image", use_column_width=True)

        # Image analysis prompt
        image_prompt = st.text_input("Ask about this image:", placeholder="What do you see in this image?")
        if st.button("Analyze Image") and image_prompt:
            if api_key:
                try:
                    # Convert image to base64
                    buffered = io.BytesIO()
                    image.save(buffered, format="PNG")
                    img_str = base64.b64encode(buffered.getvalue()).decode()

                    # Simulate vision analysis (OpenAI Vision API would be used in real implementation)
                    response_text = f"I can see an image you've uploaded. {image_prompt} (Note: Full vision analysis requires OpenAI Vision API)"

                    st.session_state.messages.append({
                        "role": "user",
                        "content": image,
                        "type": "image",
                        "prompt": image_prompt,
                        "timestamp": datetime.now()
                    })
                    st.session_state.messages.append({
                        "role": "assistant",
                        "content": response_text,
                        "type": "text",
                        "timestamp": datetime.now()
                    })
                    st.success(response_text)
                except Exception as e:
                    st.error(f"Error analyzing image: {e}")
            else:
                st.warning("Please enter your OpenAI API key.")
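        # Hedged sketch (not part of the original code): how the simulated analysis above
        # could call a real vision-capable chat model. It reuses the legacy
        # openai.ChatCompletion interface used elsewhere in this file; the model name
        # "gpt-4o" is an assumption, not something the original specifies.
        # vision_response = openai.ChatCompletion.create(
        #     model="gpt-4o",
        #     messages=[{
        #         "role": "user",
        #         "content": [
        #             {"type": "text", "text": image_prompt},
        #             {"type": "image_url",
        #              "image_url": {"url": f"data:image/png;base64,{img_str}"}},
        #         ],
        #     }],
        # )
        # response_text = vision_response.choices[0].message.content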
    # Video upload
    st.divider()
    uploaded_video = st.file_uploader("Upload a video", type=['mp4', 'avi', 'mov'])
    if uploaded_video:
        st.video(uploaded_video)
        if st.button("Analyze Video"):
            st.info("Video analysis would extract frames and analyze them sequentially.")
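            # Hedged sketch (not part of the original code): one way the frame extraction
            # described above could work with the cv2 import at the top of the file.
            # Sampling every 30th frame is an illustrative choice, not a requirement.
            # with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_vid:
            #     tmp_vid.write(uploaded_video.read())
            # cap = cv2.VideoCapture(tmp_vid.name)
            # frames, frame_idx = [], 0
            # while cap.isOpened():
            #     ok, frame = cap.read()
            #     if not ok:
            #         break
            #     if frame_idx % 30 == 0:
            #         frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            #     frame_idx += 1
            # cap.release()
            # st.write(f"Extracted {len(frames)} sample frames for analysis.")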
with tab3:
    st.subheader("Audio Input")

    # Audio recording
    if st.session_state.voice_enabled:
        st.write("🎤 Voice Recording")

        # Audio input widget
        audio_value = st.experimental_audio_input("Record a voice message")
        if audio_value:
            st.audio(audio_value)
            if st.button("Process Audio"):
                try:
                    # Speech recognition simulation
                    st.info("Processing speech... (In real implementation, this would use speech recognition)")

                    # Simulate transcription
                    transcribed_text = "This is a simulated transcription of your audio message."
                    st.session_state.messages.append({
                        "role": "user",
                        "content": audio_value,
                        "type": "audio",
                        "transcription": transcribed_text,
                        "timestamp": datetime.now()
                    })
                    st.success(f"Transcribed: {transcribed_text}")
                except Exception as e:
                    st.error(f"Error processing audio: {e}")
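                # Hedged sketch (not part of the original code): how the simulated
                # transcription above could use the speech_recognition import (sr)
                # from the top of the file. recognize_google calls Google's free web
                # speech API; that choice is an assumption for illustration only.
                # recognizer = sr.Recognizer()
                # with sr.AudioFile(audio_value) as source:
                #     audio_data = recognizer.record(source)
                # transcribed_text = recognizer.recognize_google(audio_data)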
    # File upload for audio
    uploaded_audio = st.file_uploader("Upload an audio file", type=['wav', 'mp3', 'ogg', 'm4a'])
    if uploaded_audio:
        st.audio(uploaded_audio)
        if st.button("Transcribe Audio"):
            st.info("Audio transcription would be processed here using speech recognition.")
with tab4:
    st.subheader("Conversation Analysis")

    if st.session_state.messages:
        # Message statistics
        total_messages = len(st.session_state.messages)
        user_messages = len([m for m in st.session_state.messages if m["role"] == "user"])
        assistant_messages = len([m for m in st.session_state.messages if m["role"] == "assistant"])

        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Messages", total_messages)
        with col2:
            st.metric("User Messages", user_messages)
        with col3:
            st.metric("Assistant Messages", assistant_messages)

        # Message types
        message_types = {}
        for message in st.session_state.messages:
            msg_type = message.get("type", "text")
            message_types[msg_type] = message_types.get(msg_type, 0) + 1

        if message_types:
            st.subheader("Message Types")
            df = pd.DataFrame(list(message_types.items()), columns=["Type", "Count"])
            st.bar_chart(df.set_index("Type"))

        # Recent messages
        st.subheader("Recent Messages")
        for message in st.session_state.messages[-5:]:
            with st.expander(f"{message['role'].title()} - {message.get('timestamp', 'Unknown time')}"):
                if message["type"] == "text":
                    st.write(message["content"])
                elif message["type"] == "image":
                    st.image(message["content"])
                    if "prompt" in message:
                        st.write(f"Prompt: {message['prompt']}")
                elif message["type"] == "audio":
                    st.audio(message["content"])
                    if "transcription" in message:
                        st.write(f"Transcription: {message['transcription']}")
    else:
        st.info("No conversation data available yet. Start chatting to see analysis!")
# Footer
st.divider()
st.markdown("""
### Features:
- 💬 **Text Chat**: Natural language conversation with memory
- 🖼️ **Image Analysis**: Upload and analyze images (Vision API integration ready)
- 🎥 **Video Processing**: Video upload and frame analysis
- 🎵 **Audio Processing**: Speech-to-text and text-to-speech
- 🧠 **Memory**: Persistent conversation memory
- 📊 **Analytics**: Conversation insights and statistics
- 🌍 **Multilingual**: Support for multiple languages
- 🎭 **Voice Styles**: Configurable voice emotions and speed

*Note: Some features require an OpenAI API key and additional API integrations for full functionality.*
""")
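# To try the app locally, save this script (e.g. as app.py; the filename is an
# assumption) and launch it with the Streamlit CLI: streamlit run app.py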