You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
366 lines
16 KiB
Python
366 lines
16 KiB
Python
import marimo
|
|
|
|
__generated_with = "0.13.7"
|
|
app = marimo.App(width="medium")
|
|
|
|
|
|
@app.cell
def _():
    import requests
    import json
    import numpy as np
    import pandas as pd
    import time  # To add delays

    # Choose your dimensionality reduction method.
    #
    # BUG FIX: the original returned both `TSNE` and `umap`, but only ever
    # imported ONE of them (the try branch binds umap, the except branch binds
    # TSNE), so the return statement always raised NameError. Pre-bind both to
    # None so the unused one is simply a None placeholder for downstream cells.
    umap = None
    TSNE = None

    # Option 1: UMAP (Recommended)
    try:
        import umap
        REDUCTION_METHOD = 'UMAP'
    except ImportError:
        print("UMAP not found, falling back to t-SNE. Install with: pip install umap-learn")
        from sklearn.manifold import TSNE
        REDUCTION_METHOD = 't-SNE'

    # Option 2: Force t-SNE
    # from sklearn.manifold import TSNE
    # REDUCTION_METHOD = 't-SNE'

    import plotly.express as px
    import plotly.graph_objects as go  # For more control if needed
    return REDUCTION_METHOD, TSNE, json, np, pd, px, requests, time, umap
|
|
|
|
|
|
@app.cell
def _():
    # --- Configuration ---
    # Ollama endpoint plus the two models used by this notebook. Both models
    # must already be pulled locally (`ollama pull <name>`).
    OLLAMA_API_BASE_URL = "http://localhost:11434/api"
    EMBEDDING_MODEL = "nomic-embed-text"  # Embedding model for sentence vectors
    ANALYSIS_MODEL = "gemma3:1b"          # LLM used for sentiment/topic analysis

    # Request/retry tuning for the Ollama HTTP calls.
    REQUEST_TIMEOUT = 60  # per-request timeout, seconds
    MAX_RETRIES = 3       # attempts before giving up on a request
    RETRY_DELAY = 5       # pause between attempts, seconds

    return (
        ANALYSIS_MODEL,
        EMBEDDING_MODEL,
        MAX_RETRIES,
        OLLAMA_API_BASE_URL,
        REQUEST_TIMEOUT,
        RETRY_DELAY,
    )
|
|
|
|
|
|
@app.cell
def _():
    # --- Data: Sentences to Analyze (Focus on Sentiment & Topics) ---
    # Fifteen hand-written sentences, five per sentiment bucket, spanning the
    # topic categories used later (Travel, Food, Tech, Emotion, Weather, Work).
    positive_sentences = [
        "I had an absolutely wonderful time on vacation!",              # Travel
        "This is the best pizza I've ever tasted, truly amazing.",      # Food
        "The new software update significantly improved performance.",  # Tech
        "She was overjoyed to receive the award.",                      # Emotion
        "What a beautiful, sunny day for a picnic!",                    # Weather/Activity
    ]
    negative_sentences = [
        "The airline lost my luggage, ruining the start of my trip.",   # Travel
        "I found the meal to be bland and overpriced.",                 # Food
        "Debugging this legacy code is incredibly frustrating.",        # Tech
        "He felt heartbroken and betrayed after the argument.",         # Emotion
        "The constant rain made the whole weekend gloomy.",             # Weather/Activity
    ]
    neutral_sentences = [
        "The train is scheduled to depart at 3:00 PM.",                 # Travel
        "The ingredients listed include flour, water, and salt.",       # Food
        "The system requires 8GB of RAM to operate.",                   # Tech
        "Please file the report by the end of the day.",                # Work/Instruction
        "The weather report indicates a chance of showers tomorrow.",   # Weather
    ]

    # Preserve the original ordering: positive, then negative, then neutral.
    sentences = positive_sentences + negative_sentences + neutral_sentences
    return (sentences,)
|
|
|
|
|
|
@app.cell
def _(
    EMBEDDING_MODEL,
    MAX_RETRIES,
    OLLAMA_API_BASE_URL,
    REQUEST_TIMEOUT,
    RETRY_DELAY,
    json,
    np,
    requests,
    sentences,
    time,
):
    # --- Helper Function for Ollama API Calls ---
    def ollama_request(endpoint, model_name, payload_key, payload_value, stream=False):
        """Sends a request to a specified Ollama endpoint and handles retries.

        Args:
            endpoint: Path segment under OLLAMA_API_BASE_URL
                (e.g. "embeddings" or "generate").
            model_name: Ollama model to use for this request.
            payload_key: Name of the payload field carrying the input
                ("prompt" for both endpoints used in this notebook).
            payload_value: The text sent under ``payload_key``.
            stream: Whether to request a streamed response. The streaming
                branch below is a basic line-by-line accumulator; callers in
                this notebook always pass stream=False.

        Returns:
            The parsed JSON response dict on success, or None after
            MAX_RETRIES failed attempts (timeout or request error).
        """
        url = f"{OLLAMA_API_BASE_URL}/{endpoint}"
        payload = {
            "model": model_name,
            payload_key: payload_value,
            "stream": stream # Important for generate endpoint if expecting single response
        }
        print(f" Sending request to {url} for model {model_name}...") # Debug info

        for attempt in range(MAX_RETRIES):
            try:
                response = requests.post(url, json=payload, timeout=REQUEST_TIMEOUT)
                response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)

                # Handle potential streaming vs non-streaming (generate vs embeddings)
                if stream:
                    # Process line-by-line if needed, for now assume generate returns single JSON when stream=False
                    # This part might need adjustment depending on exact API behavior if stream=True
                    full_response_content = ""
                    for line in response.iter_lines():
                        if line:
                            try:
                                decoded_line = line.decode('utf-8')
                                json_chunk = json.loads(decoded_line)
                                # Accumulate response content if needed, e.g., from json_chunk.get('response', '')
                                full_response_content += json_chunk.get('response', '') # Example for generate
                                if json_chunk.get('done'):
                                    # If using generate with stream=True, the last chunk might have final details
                                    # For stream=False with generate, we expect one JSON object below
                                    pass
                            except json.JSONDecodeError:
                                print(f" Warning: Could not decode JSON line: {line}")
                    # This needs careful implementation based on streaming needs.
                    # For simple generation (stream=False), the non-streaming part below is better.
                    # For simplicity, let's enforce stream=False for generate.
                    print(" Warning: Streaming response handling is basic. Recommend stream=False for generate.")
                    return {"response": full_response_content} # Example structure

                else: # Not streaming (embeddings or generate with stream=False)
                    response_data = response.json()
                    # print(f" Received response: {str(response_data)[:100]}...") # Debug: Print part of response
                    return response_data

            except requests.exceptions.Timeout:
                print(f" Attempt {attempt + 1}/{MAX_RETRIES}: Request timed out.")
            except requests.exceptions.RequestException as e:
                print(f" Attempt {attempt + 1}/{MAX_RETRIES}: Request failed: {e}")

            # Only reached when this attempt raised; sleep-and-retry unless
            # this was the final attempt, in which case signal failure.
            if attempt < MAX_RETRIES - 1:
                print(f" Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)
            else:
                print(f" Max retries reached. Failing request for: {payload_value[:50]}...")
                return None # Indicate failure

    # --- 1. Get Embeddings from Ollama ---
    # Request one embedding per sentence; sentences whose request fails are
    # dropped so downstream arrays stay aligned with sentences_filtered.
    print(f"\n--- Getting Embeddings ({EMBEDDING_MODEL}) ---")
    embeddings = []
    valid_sentences_indices = [] # Keep track of sentences for which we got embeddings
    for i1, sentence1 in enumerate(sentences):
        print(f"Processing sentence {i1+1}/{len(sentences)}: '{sentence1[:40]}...'")
        response_data = ollama_request("embeddings", EMBEDDING_MODEL, "prompt", sentence1)
        if response_data and "embedding" in response_data:
            embeddings.append(response_data["embedding"])
            valid_sentences_indices.append(i1)
        else:
            print(f" Failed to get embedding for sentence {i1+1}. Skipping.")

    # Filter sentences list to only include those with successful embeddings
    sentences_filtered = [sentences[i] for i in valid_sentences_indices]
    if not embeddings:
        print("\nError: No embeddings were successfully retrieved. Exiting.")
        # NOTE(review): exit() inside a marimo cell kills the whole kernel;
        # consider raising instead — confirm intended behavior.
        exit()

    # Rows = sentences, columns = embedding dimensions.
    embeddings_array = np.array(embeddings)
    print(f"\nSuccessfully got {embeddings_array.shape[0]} embeddings with dimension {embeddings_array.shape[1]}.")
    return embeddings_array, ollama_request, sentences_filtered
|
|
|
|
|
|
@app.cell
def _(REDUCTION_METHOD, TSNE, embeddings_array, umap):
    # --- 2. Dimensionality Reduction ---
    # Project the high-dimensional sentence embeddings down to 2D for plotting,
    # using whichever method the import cell selected (UMAP preferred).
    print(f"\n--- Reducing Dimensionality ({REDUCTION_METHOD}) ---")
    sample_count = embeddings_array.shape[0]
    if sample_count <= 1:
        print("\nError: Need at least 2 data points for dimensionality reduction.")
        exit()

    reduced_embeddings = None
    if REDUCTION_METHOD == 'UMAP':
        # n_neighbors must stay below the sample count; clamp small datasets.
        neighbor_count = min(15, sample_count - 1)
        if neighbor_count < 2:
            neighbor_count = max(2, sample_count - 1)
        print(f" Using UMAP with n_neighbors={neighbor_count}")
        umap_reducer = umap.UMAP(
            n_neighbors=neighbor_count,
            n_components=2,
            min_dist=0.1,
            metric='cosine',
            random_state=42,
        )
        reduced_embeddings = umap_reducer.fit_transform(embeddings_array)
    elif REDUCTION_METHOD == 't-SNE':
        # Perplexity must be positive and below the sample count.
        perplexity = min(30, sample_count - 1)
        if perplexity <= 0:
            perplexity = max(1, sample_count - 1)
        print(f" Using t-SNE with perplexity={perplexity}")
        tsne_reducer = TSNE(
            n_components=2,
            random_state=42,
            perplexity=perplexity,
            init='pca',
            learning_rate='auto',
        )
        reduced_embeddings = tsne_reducer.fit_transform(embeddings_array)

    if reduced_embeddings is None:
        print("Error: Dimensionality reduction failed.")
        exit()

    print("Dimensionality reduction complete.")
    return (reduced_embeddings,)
|
|
|
|
|
|
@app.cell
def _(ANALYSIS_MODEL, ollama_request, sentences_filtered, time):
    # --- 3. Perform LLM Analysis (Sentiment & Topic) ---
    # For each embedded sentence, ask the analysis LLM for (a) a 1-5 sentiment
    # rating and (b) a topic category, then collect one result dict per
    # sentence: {"sentence", "sentiment_score" (int or None), "topic" (str)}.
    print(f"\n--- Performing LLM Analysis ({ANALYSIS_MODEL}) ---")
    analysis_results = []
    possible_topics = ["Travel", "Food", "Tech", "Emotion", "Weather", "Work", "Other"] # Predefined topics

    # Define prompts (Keep them simple for smaller models)
    sentiment_prompt_template = """Rate the sentiment of the following sentence on a scale of 1 (very negative) to 5 (very positive). Output ONLY the number (1, 2, 3, 4, or 5).
Sentence: {}
Sentiment rating (1-5):"""

    topic_prompt_template = f"""Categorize the main topic of the following sentence into ONE of these categories: {', '.join(possible_topics)}. Output ONLY the category name.
Sentence: {{}}
Topic category:"""

    for i2, sentence2 in enumerate(sentences_filtered):
        print(f"Analyzing sentence {i2+1}/{len(sentences_filtered)}: '{sentence2[:40]}...'")
        sentence_analysis = {"sentence": sentence2, "sentiment_score": None, "topic": "Unknown"} # Defaults

        # --- Sentiment Analysis ---
        print(" Analyzing sentiment...")
        sentiment_prompt = sentiment_prompt_template.format(sentence2)
        sentiment_response = ollama_request("generate", ANALYSIS_MODEL, "prompt", sentiment_prompt, stream=False)

        if sentiment_response and "response" in sentiment_response:
            llm_output = sentiment_response["response"].strip()
            try:
                # Try to extract just the number: take the first whitespace
                # token and strip trailing punctuation before parsing.
                score = int(llm_output.split()[0].strip('.,!?;:"\'')) # Basic extraction
                if 1 <= score <= 5:
                    sentence_analysis["sentiment_score"] = score
                    print(f" Sentiment score: {score}")
                else:
                    # Parsed an int, but outside the 1-5 scale; keep None.
                    print(f" Warning: Sentiment score out of range ({score}). LLM Output: '{llm_output}'")
            except (ValueError, IndexError):
                # Non-numeric first token, or empty output.
                print(f" Warning: Could not parse sentiment score. LLM Output: '{llm_output}'")
        else:
            print(" Failed to get sentiment response.")

        # Add a small delay to avoid overwhelming the API
        time.sleep(0.5)

        # --- Topic Analysis ---
        print(" Analyzing topic...")
        topic_prompt = topic_prompt_template.format(sentence2)
        topic_response = ollama_request("generate", ANALYSIS_MODEL, "prompt", topic_prompt, stream=False)

        if topic_response and "response" in topic_response:
            llm_output = topic_response["response"].strip().capitalize() # Capitalize for consistency
            # Find the best match from our predefined list
            found_topic = "Unknown"
            for topic in possible_topics:
                # Case-insensitive substring match against the LLM output; the
                # list order decides ties since we take the first hit.
                if topic.lower() in llm_output.lower(): # Simple check if topic name is in output
                    found_topic = topic
                    break # Take the first match
            sentence_analysis["topic"] = found_topic
            print(f" Topic detected: {found_topic} (LLM Output: '{llm_output}')")

            if found_topic == "Unknown":
                print(f" Warning: Could not match topic to predefined list. LLM Output: '{llm_output}'")
        else:
            print(" Failed to get topic response.")

        analysis_results.append(sentence_analysis)
        time.sleep(0.5) # Another small delay
    return (analysis_results,)
|
|
|
|
|
|
@app.cell
def _(analysis_results, pd, reduced_embeddings):
    # --- 4. Prepare Data for Plotting ---
    # Combine the LLM analysis dicts with the 2D coordinates into one
    # DataFrame; rows of both inputs are aligned by construction.
    print("\n--- Preparing Data for Plotting ---")
    df = pd.DataFrame(analysis_results)

    # 1-based label shown on each plotted point.
    df['index'] = list(range(1, len(df) + 1))

    # 2D coordinates from the dimensionality-reduction step.
    df['x'] = reduced_embeddings[:, 0]
    df['y'] = reduced_embeddings[:, 1]

    # Handle missing sentiment scores for coloring (e.g., set to neutral 3)
    df['sentiment_score_filled'] = df['sentiment_score'].fillna(3).astype(int)

    print("Analysis and data preparation complete.")
    return (df,)
|
|
|
|
|
|
@app.cell
def _(ANALYSIS_MODEL, EMBEDDING_MODEL, REDUCTION_METHOD, df, px):
    # --- 5. Visualization with Plotly (Interactive & Colored) ---

    def plot_interactive_analyzed(df, model_name, reduction_method_name="UMAP"):
        """
        Generates an interactive scatter plot colored by sentiment, with hover info.
        """
        # Diverging Red-Yellow-Green scale: 1=Red (negative), 3=Yellow
        # (neutral), 5=Green (positive).
        sentiment_scale = px.colors.diverging.RdYlGn

        # Hover tooltip configuration: show the sentence, raw sentiment score
        # and topic; hide coordinates and internal helper columns.
        hover_fields = {
            'sentence': True,
            'sentiment_score': True,
            'topic': True,
            'x': False,
            'y': False,
            'index': False,
            'sentiment_score_filled': False,
        }
        axis_labels = {
            'x': f'{reduction_method_name} Component 1',
            'y': f'{reduction_method_name} Component 2',
            'color': 'Sentiment Score',
        }

        fig = px.scatter(
            df,
            x='x',
            y='y',
            color='sentiment_score_filled',   # color points by sentiment
            color_continuous_scale=sentiment_scale,
            range_color=[1, 5],               # full 1-5 sentiment range
            color_continuous_midpoint=3,      # center at neutral
            text='index',                     # index number on each point
            hover_name='index',
            hover_data=hover_fields,
            title=f'Interactive 2D {reduction_method_name} of Sentence Embeddings ({model_name})<br>Colored by Sentiment (1-5), Analyzed by {ANALYSIS_MODEL}',
            labels=axis_labels,
        )

        # --- Optional Customizations ---
        fig.update_traces(
            textposition='top center',
            textfont_size=10,
            marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey')),
        )
        fig.update_layout(
            coloraxis_colorbar=dict(
                title="Sentiment Score",
                tickvals=[1, 2, 3, 4, 5],
                ticktext=["1 (Neg)", "2", "3 (Neu)", "4", "5 (Pos)"],
            ),
            hovermode='closest',
            title_x=0.5,
        )

        fig.show()

    # --- Call the plotting function ---
    plot_interactive_analyzed(df, EMBEDDING_MODEL, REDUCTION_METHOD)
    return
|
|
|
|
|
|
@app.cell
def _():
    # Intentionally empty marimo cell (scratch space left by the notebook).
    return
|
|
|
|
|
|
if __name__ == "__main__":
    # Entry point when executed as a script: run the marimo notebook app.
    app.run()
|