Write me an app that can upload and download data; this data can be XML, XLSX, or CSV files, or a database. I want to be able to detect or anonymise the sensitive data inside.
To upload files, please first save the app
import json
import os
import sqlite3
import tempfile
import xml.etree.ElementTree as ET
from io import StringIO, BytesIO

import pandas as pd
import streamlit as st
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
# Page-level configuration: browser tab title and a wide layout.
st.set_page_config(page_title="Data Anonymizer", layout="wide")
# Heading rendered at the top of the page.
st.title("Data Anonymizer")
# Shared analyzer instance, created lazily on first use. Building an
# AnalyzerEngine per call is wasteful because this function is invoked once
# per DataFrame cell.
_ANALYZER = None


def _get_analyzer():
    """Return the shared AnalyzerEngine, creating it on first use."""
    global _ANALYZER
    if _ANALYZER is None:
        _ANALYZER = AnalyzerEngine()
    return _ANALYZER


def detect_sensitive_data(text):
    """Scan *text* for sensitive entities using the Presidio analyzer.

    Args:
        text: The string to analyze (English is assumed).

    Returns:
        Whatever ``AnalyzerEngine.analyze`` returns for the text; an empty
        list when *text* is empty or ``None`` (the engine is not consulted
        in that case).
    """
    # Nothing to find in empty input; skip the engine entirely.
    if not text:
        return []
    return _get_analyzer().analyze(text=text, language='en')
# Shared anonymizer instance, created lazily on first use so that it is not
# rebuilt for every cell that needs anonymizing.
_ANONYMIZER = None


def _get_anonymizer():
    """Return the shared AnonymizerEngine, creating it on first use."""
    global _ANONYMIZER
    if _ANONYMIZER is None:
        _ANONYMIZER = AnonymizerEngine()
    return _ANONYMIZER


def anonymize_text(text, results):
    """Replace the entities described by *results* inside *text*.

    Args:
        text: The original string.
        results: Analyzer findings for *text*, as produced by
            ``detect_sensitive_data``.

    Returns:
        The anonymized string. When *results* is empty or ``None`` the input
        text is returned unchanged without invoking the engine.
    """
    # No findings means there is nothing to replace.
    if not results:
        return text
    anonymized = _get_anonymizer().anonymize(
        text=text,
        analyzer_results=results,
    )
    return anonymized.text
def process_dataframe(df):
    """Anonymize sensitive text in every non-missing cell of *df*, in place.

    Each cell is rendered as a string and passed to ``detect_sensitive_data``;
    cells with findings are overwritten with the output of ``anonymize_text``.

    Args:
        df: DataFrame to scan. It is modified in place, so pass a copy when
            the original must be preserved.

    Returns:
        The same DataFrame object that was passed in.
    """
    # String view used only for analysis; the original values stay in df.
    df_str = df.astype(str)
    # Missing cells would otherwise be analyzed as the literal "nan"/"None".
    missing = df.isna()
    # Cell text -> anonymized replacement, or None when nothing was detected.
    # Repeated values are analyzed only once.
    replacements = {}
    for column in df_str.columns:
        for idx in df_str.index:
            if missing.at[idx, column]:
                continue
            text = df_str.at[idx, column]
            if text not in replacements:
                results = detect_sensitive_data(text)
                replacements[text] = (
                    anonymize_text(text, results) if results else None
                )
            replacement = replacements[text]
            if replacement is None:
                continue
            # Writing a string into a numeric/datetime column needs an
            # explicit upcast; pandas warns or raises on the implicit one.
            dtype = df[column].dtype
            if not (
                pd.api.types.is_object_dtype(dtype)
                or pd.api.types.is_string_dtype(dtype)
            ):
                df[column] = df[column].astype(object)
            df.at[idx, column] = replacement
    return df
def read_xml_file(content):
    """Turn raw XML bytes into a DataFrame with one row per root child.

    Every direct child of the document root becomes a row. Each of that
    child's own children contributes a column named after its tag and
    holding the element's text.

    Args:
        content: The XML document as bytes.

    Returns:
        A DataFrame built from the collected rows.
    """
    document_root = ET.parse(BytesIO(content)).getroot()
    # One dict per record element, mapping field tag -> field text.
    records = [
        {field.tag: field.text for field in record}
        for record in document_root
    ]
    return pd.DataFrame(records)
# Leading bytes of a SQLite 3 database file; used to tell a binary database
# apart from a plain-text SQL script.
_SQLITE_HEADER = b"SQLite format 3\x00"


def _load_uploaded_file(uploaded_file):
    """Parse an uploaded CSV, XLSX or XML file into a DataFrame."""
    # Compare case-insensitively so that e.g. "DATA.CSV" is recognised.
    name = uploaded_file.name.lower()
    if name.endswith('.csv'):
        return pd.read_csv(uploaded_file)
    if name.endswith('.xlsx'):
        return pd.read_excel(uploaded_file)
    if name.endswith('.xml'):
        return read_xml_file(uploaded_file.getvalue())
    raise ValueError(f"Unsupported file type: {uploaded_file.name}")


def _quote_identifier(name):
    """Quote *name* as a SQLite identifier, escaping embedded double quotes."""
    return '"' + name.replace('"', '""') + '"'


def _connect_uploaded_sqlite(bytes_data):
    """Load uploaded database bytes into an in-memory SQLite connection.

    Binary SQLite files are written to a temporary file and copied into
    memory; anything else is treated as a UTF-8 SQL script and executed.
    The caller is responsible for closing the returned connection.
    """
    memory_conn = sqlite3.connect(':memory:')
    try:
        if not bytes_data.startswith(_SQLITE_HEADER):
            # Plain-text SQL dump: run it to build the schema and data.
            memory_conn.executescript(bytes_data.decode('utf-8'))
            return memory_conn
        fd, path = tempfile.mkstemp(suffix='.sqlite')
        try:
            with os.fdopen(fd, 'wb') as handle:
                handle.write(bytes_data)
            disk_conn = sqlite3.connect(path)
            try:
                # Copy into memory so the temp file can be removed right away.
                disk_conn.backup(memory_conn)
            finally:
                disk_conn.close()
        finally:
            os.unlink(path)
        return memory_conn
    except Exception:
        memory_conn.close()
        raise


def _render_anonymized(processed_df, base_name, key_prefix):
    """Show the anonymized DataFrame with CSV and Excel download buttons."""
    st.write("Anonymized Data:")
    st.dataframe(processed_df)
    csv = processed_df.to_csv(index=False)
    excel_buffer = BytesIO()
    processed_df.to_excel(excel_buffer, index=False)
    col1, col2 = st.columns(2)
    # Explicit keys keep the file and database download buttons distinct.
    with col1:
        st.download_button(
            label="Download as CSV",
            data=csv,
            file_name=f"{base_name}.csv",
            mime="text/csv",
            key=f"{key_prefix}_download_csv",
        )
    with col2:
        st.download_button(
            label="Download as Excel",
            data=excel_buffer.getvalue(),
            file_name=f"{base_name}.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            key=f"{key_prefix}_download_xlsx",
        )


def _render_sqlite_section(db_file):
    """Let the user pick a table from the uploaded database and anonymize it."""
    conn = _connect_uploaded_sqlite(db_file.getvalue())
    try:
        rows = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table';"
        ).fetchall()
        selected_table = st.selectbox("Select Table", [row[0] for row in rows])
        if not selected_table:
            return
        # The table name comes from the uploaded file, so quote it.
        query = f"SELECT * FROM {_quote_identifier(selected_table)}"
        df = pd.read_sql_query(query, conn)
    finally:
        conn.close()
    st.write("Original Data:")
    st.dataframe(df)
    if st.button("Detect and Anonymize Database Data"):
        _render_anonymized(
            process_dataframe(df.copy()),
            f"anonymized_{selected_table}",
            "db",
        )


def main():
    """Render the app: file upload, SQLite upload, anonymize and download.

    The first section accepts a CSV, XLSX or XML file; the second accepts a
    SQLite database (binary file or SQL script) and lets the user choose a
    table. In both, the data is shown, and a button runs
    ``process_dataframe`` on a copy and offers the result as CSV and Excel.
    Errors in either section are reported with ``st.error``.

    NOTE(review): results are rendered only in the run where the anonymize
    button was clicked; they presumably vanish on the rerun that follows a
    download click. Consider ``st.session_state`` if that matters.
    """
    st.write("Upload your data file (CSV, XLSX, XML) or connect to a database")
    # File upload
    uploaded_file = st.file_uploader("Choose a file", type=['csv', 'xlsx', 'xml'])
    if uploaded_file is not None:
        try:
            df = _load_uploaded_file(uploaded_file)
            st.write("Original Data:")
            st.dataframe(df)
            if st.button("Detect and Anonymize Sensitive Data"):
                _render_anonymized(
                    process_dataframe(df.copy()),
                    "anonymized_data",
                    "file",
                )
        except Exception as e:
            st.error(f"Error processing file: {str(e)}")
    # Database connection
    st.write("---")
    st.subheader("Connect to Database")
    db_type = st.selectbox("Select Database Type", ["SQLite"])
    if db_type == "SQLite":
        db_file = st.file_uploader(
            "Upload SQLite Database", type=['db', 'sqlite', 'sqlite3']
        )
        if db_file is not None:
            try:
                _render_sqlite_section(db_file)
            except Exception as e:
                st.error(f"Error processing database: {str(e)}")
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Hi! I can help you with any questions about Streamlit and Python. What would you like to know?