import marimo

__generated_with = "0.13.7"
app = marimo.App(width="medium")


@app.cell
def _():
    # Shared imports for the notebook. Every name returned here is consumed
    # by a later cell through its parameter list.
    import requests
    import json
    import numpy as np
    import pandas as pd
    import time  # To add delays

    # Choose your dimensionality reduction method.
    # Only one of `umap` / `TSNE` is actually imported below, yet both are
    # returned from this cell. Pre-bind both to None so the return statement
    # cannot raise NameError for whichever backend was not selected.
    umap = None
    TSNE = None

    # Option 1: UMAP (Recommended)
    try:
        import umap
        REDUCTION_METHOD = 'UMAP'
    except ImportError:
        print("UMAP not found, falling back to t-SNE. Install with: pip install umap-learn")
        from sklearn.manifold import TSNE
        REDUCTION_METHOD = 't-SNE'

    # Option 2: Force t-SNE
    # from sklearn.manifold import TSNE
    # REDUCTION_METHOD = 't-SNE'

    import plotly.express as px
    import plotly.graph_objects as go  # For more control if needed
    return REDUCTION_METHOD, TSNE, json, np, pd, px, requests, time, umap


@app.cell
def _():
    # --- Configuration ---
    OLLAMA_API_BASE_URL = "http://localhost:11434/api"
    EMBEDDING_MODEL = "nomic-embed-text"  # For generating embeddings
    ANALYSIS_MODEL = "gemma3:1b"  # LLM for sentiment/topic analysis (ensure pulled)
    REQUEST_TIMEOUT = 60  # Timeout for Ollama requests (seconds)
    RETRY_DELAY = 5  # Delay between retries (seconds)
    MAX_RETRIES = 3  # Max retries for Ollama requests
    return (
        ANALYSIS_MODEL,
        EMBEDDING_MODEL,
        MAX_RETRIES,
        OLLAMA_API_BASE_URL,
        REQUEST_TIMEOUT,
        RETRY_DELAY,
    )


@app.cell
def _():
    # --- Data: Sentences to Analyze (Focus on Sentiment & Topics) ---
    sentences = [
        # Positive Sentiments
        "I had an absolutely wonderful time on vacation!",  # Travel, Positive
        "This is the best pizza I've ever tasted, truly amazing.",  # Food, Positive
        "The new software update significantly improved performance.",  # Tech, Positive
        "She was overjoyed to receive the award.",  # Emotion, Positive
        "What a beautiful, sunny day for a picnic!",  # Weather/Activity, Positive
        # Negative Sentiments
        "The airline lost my luggage, ruining the start of my trip.",  # Travel, Negative
        "I found the meal to be bland and overpriced.",  # Food, Negative
        "Debugging this legacy code is incredibly frustrating.",  # Tech, Negative
        "He felt heartbroken and betrayed after the argument.",  # Emotion, Negative
        "The constant rain made the whole weekend gloomy.",  # Weather/Activity, Negative
        # Neutral/Objective Sentiments
        "The train is scheduled to depart at 3:00 PM.",  # Travel, Neutral
        "The ingredients listed include flour, water, and salt.",  # Food, Neutral
        "The system requires 8GB of RAM to operate.",  # Tech, Neutral
        "Please file the report by the end of the day.",  # Work/Instruction, Neutral/Work
        "The weather report indicates a chance of showers tomorrow.",  # Weather, Neutral
    ]
    return (sentences,)


@app.cell
def _(
    EMBEDDING_MODEL,
    MAX_RETRIES,
    OLLAMA_API_BASE_URL,
    REQUEST_TIMEOUT,
    RETRY_DELAY,
    json,
    np,
    requests,
    sentences,
    time,
):
    # --- Helper Function for Ollama API Calls ---
    def ollama_request(endpoint, model_name, payload_key, payload_value, stream=False):
        """Send a POST request to an Ollama endpoint, retrying on failure.

        Args:
            endpoint: Path appended to OLLAMA_API_BASE_URL (this notebook
                uses "embeddings" and "generate").
            model_name: Value sent as the "model" field of the JSON payload.
            payload_key: Name of the payload field carrying the input
                (this notebook uses "prompt").
            payload_value: Value stored under ``payload_key``.
            stream: Sent as the "stream" payload field and also used to
                decide how the HTTP response is read.

        Returns:
            With ``stream=False``: the decoded JSON body of the response.
            With ``stream=True``: ``{"response": <text>}`` where the text is
            the concatenation of the "response" field of every JSON line.
            ``None`` if all MAX_RETRIES attempts failed.
        """
        url = f"{OLLAMA_API_BASE_URL}/{endpoint}"
        payload = {
            "model": model_name,
            payload_key: payload_value,
            "stream": stream,  # Important for generate endpoint if expecting single response
        }
        print(f" Sending request to {url} for model {model_name}...")  # Debug info

        for attempt in range(MAX_RETRIES):
            try:
                # Pass stream= through to requests so that a streaming call
                # is consumed incrementally instead of being fully buffered
                # before iter_lines() is reached.
                response = requests.post(
                    url, json=payload, timeout=REQUEST_TIMEOUT, stream=stream
                )
                response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

                if not stream:
                    # Embeddings, or generate with stream=False: one JSON object.
                    return response.json()

                # Streaming: one JSON object per line; collect the text parts.
                chunks = []
                for line in response.iter_lines():
                    if not line:
                        continue
                    try:
                        json_chunk = json.loads(line.decode('utf-8'))
                    except (json.JSONDecodeError, UnicodeDecodeError):
                        print(f" Warning: Could not decode JSON line: {line}")
                        continue
                    chunks.append(json_chunk.get('response', ''))
                print(" Warning: Streaming response handling is basic. Recommend stream=False for generate.")
                return {"response": "".join(chunks)}

            except requests.exceptions.Timeout:
                print(f" Attempt {attempt + 1}/{MAX_RETRIES}: Request timed out.")
            except requests.exceptions.RequestException as e:
                print(f" Attempt {attempt + 1}/{MAX_RETRIES}: Request failed: {e}")

            # Only reached when the attempt above failed.
            if attempt < MAX_RETRIES - 1:
                print(f" Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)
            else:
                # str() keeps the log line safe for non-string payload values.
                print(f" Max retries reached. Failing request for: {str(payload_value)[:50]}...")

        return None  # Indicate failure

    # --- 1. Get Embeddings from Ollama ---
    print(f"\n--- Getting Embeddings ({EMBEDDING_MODEL}) ---")
    embeddings = []
    valid_sentences_indices = []  # Keep track of sentences for which we got embeddings

    for i1, sentence1 in enumerate(sentences):
        print(f"Processing sentence {i1+1}/{len(sentences)}: '{sentence1[:40]}...'")
        response_data = ollama_request("embeddings", EMBEDDING_MODEL, "prompt", sentence1)
        if response_data and "embedding" in response_data:
            embeddings.append(response_data["embedding"])
            valid_sentences_indices.append(i1)
        else:
            print(f" Failed to get embedding for sentence {i1+1}. Skipping.")

    # Filter sentences list to only include those with successful embeddings
    sentences_filtered = [sentences[i] for i in valid_sentences_indices]

    if not embeddings:
        # Raise instead of calling exit(): exit() is the interactive-shell
        # helper and would terminate the notebook process. An exception
        # stops this cell and everything that depends on it.
        raise RuntimeError("No embeddings were successfully retrieved.")

    embeddings_array = np.array(embeddings)
    print(f"\nSuccessfully got {embeddings_array.shape[0]} embeddings with dimension {embeddings_array.shape[1]}.")
    return embeddings_array, ollama_request, sentences_filtered


@app.cell
def _(REDUCTION_METHOD, TSNE, embeddings_array, umap):
    # --- 2. Dimensionality Reduction ---
    # Projects the embedding vectors down to 2 components for plotting.
    print(f"\n--- Reducing Dimensionality ({REDUCTION_METHOD}) ---")
    n_samples = embeddings_array.shape[0]

    if n_samples <= 1:
        raise ValueError("Need at least 2 data points for dimensionality reduction.")

    if REDUCTION_METHOD == 'UMAP':
        # Clamp to [2, 15] while staying below the sample count when possible.
        n_neighbors_value = max(2, min(15, n_samples - 1))
        print(f" Using UMAP with n_neighbors={n_neighbors_value}")
        reducer = umap.UMAP(
            n_neighbors=n_neighbors_value,
            n_components=2,
            min_dist=0.1,
            metric='cosine',
            random_state=42,
        )
        reduced_embeddings = reducer.fit_transform(embeddings_array)
    elif REDUCTION_METHOD == 't-SNE':
        # Perplexity must stay below the number of samples.
        perplexity_value = max(1, min(30, n_samples - 1))
        print(f" Using t-SNE with perplexity={perplexity_value}")
        tsne = TSNE(
            n_components=2,
            random_state=42,
            perplexity=perplexity_value,
            init='pca',
            learning_rate='auto',
        )
        reduced_embeddings = tsne.fit_transform(embeddings_array)
    else:
        raise ValueError(
            f"Dimensionality reduction failed: unknown method {REDUCTION_METHOD!r}."
        )

    print("Dimensionality reduction complete.")
    return (reduced_embeddings,)


@app.cell
def _(ANALYSIS_MODEL, ollama_request, sentences_filtered, time):
    # --- 3. Perform LLM Analysis (Sentiment & Topic) ---
    # For every sentence that has an embedding, ask the analysis model for a
    # 1-5 sentiment rating and for one topic out of `possible_topics`.
    print(f"\n--- Performing LLM Analysis ({ANALYSIS_MODEL}) ---")
    analysis_results = []
    possible_topics = ["Travel", "Food", "Tech", "Emotion", "Weather", "Work", "Other"]  # Predefined topics

    # Define prompts (Keep them simple for smaller models)
    sentiment_prompt_template = """Rate the sentiment of the following sentence on a scale of 1 (very negative) to 5 (very positive).
Output ONLY the number (1, 2, 3, 4, or 5).
Sentence: {}
Sentiment rating (1-5):"""

    # f-string: the topic list is interpolated now, `{{}}` leaves a `{}`
    # placeholder that .format() fills with the sentence later.
    topic_prompt_template = f"""Categorize the main topic of the following sentence into ONE of these categories: {', '.join(possible_topics)}.
Output ONLY the category name.
Sentence: {{}}
Topic category:"""

    for i2, sentence2 in enumerate(sentences_filtered):
        print(f"Analyzing sentence {i2+1}/{len(sentences_filtered)}: '{sentence2[:40]}...'")
        sentence_analysis = {"sentence": sentence2, "sentiment_score": None, "topic": "Unknown"}  # Defaults

        # --- Sentiment Analysis ---
        print(" Analyzing sentiment...")
        sentiment_prompt = sentiment_prompt_template.format(sentence2)
        sentiment_response = ollama_request("generate", ANALYSIS_MODEL, "prompt", sentiment_prompt, stream=False)

        if sentiment_response and "response" in sentiment_response:
            llm_output = sentiment_response["response"].strip()
            try:
                # Take the first whitespace-separated token and drop
                # surrounding punctuation before converting to int.
                score = int(llm_output.split()[0].strip('.,!?;:"\''))
                if 1 <= score <= 5:
                    sentence_analysis["sentiment_score"] = score
                    print(f" Sentiment score: {score}")
                else:
                    print(f" Warning: Sentiment score out of range ({score}). LLM Output: '{llm_output}'")
            except (ValueError, IndexError):
                # ValueError: token is not an integer; IndexError: empty output.
                print(f" Warning: Could not parse sentiment score. LLM Output: '{llm_output}'")
        else:
            print(" Failed to get sentiment response.")

        # Add a small delay to avoid overwhelming the API
        time.sleep(0.5)

        # --- Topic Analysis ---
        print(" Analyzing topic...")
        topic_prompt = topic_prompt_template.format(sentence2)
        topic_response = ollama_request("generate", ANALYSIS_MODEL, "prompt", topic_prompt, stream=False)

        if topic_response and "response" in topic_response:
            llm_output = topic_response["response"].strip().capitalize()  # Capitalize for consistency
            # Case-insensitive substring match; the first topic in
            # `possible_topics` order that appears in the output wins.
            _output_lower = llm_output.lower()
            found_topic = next(
                (topic for topic in possible_topics if topic.lower() in _output_lower),
                "Unknown",
            )
            sentence_analysis["topic"] = found_topic
            print(f" Topic detected: {found_topic} (LLM Output: '{llm_output}')")
            if found_topic == "Unknown":
                print(f" Warning: Could not match topic to predefined list. LLM Output: '{llm_output}'")
        else:
            print(" Failed to get topic response.")

        analysis_results.append(sentence_analysis)
        time.sleep(0.5)  # Another small delay
    return (analysis_results,)


@app.cell
def _(analysis_results, pd, reduced_embeddings):
    # --- 4. Prepare Data for Plotting ---
    # One row per analysed sentence, joined positionally with its 2D point.
    print("\n--- Preparing Data for Plotting ---")
    df = pd.DataFrame(analysis_results)
    df['x'] = reduced_embeddings[:, 0]
    df['y'] = reduced_embeddings[:, 1]
    df['index'] = range(1, len(df) + 1)  # 1-based label shown on the plot

    # Handle missing sentiment scores for coloring (e.g., set to neutral 3).
    # to_numeric first, so a column holding only None (object dtype) becomes
    # NaN floats before fillna/astype instead of relying on object downcasting.
    df['sentiment_score_filled'] = (
        pd.to_numeric(df['sentiment_score'], errors='coerce').fillna(3).astype(int)
    )

    print("Analysis and data preparation complete.")
    # Display the resulting DataFrame (optional)
    # print("\nAnalysis DataFrame:")
    # print(df)
    return (df,)


@app.cell
def _(ANALYSIS_MODEL, EMBEDDING_MODEL, REDUCTION_METHOD, df, px):
    # --- 5. Visualization with Plotly (Interactive & Colored) ---
    def plot_interactive_analyzed(df, model_name, reduction_method_name="UMAP"):
        """Show an interactive scatter plot colored by sentiment, with hover info.

        Args:
            df: DataFrame with the columns 'x', 'y', 'index', 'sentence',
                'sentiment_score', 'sentiment_score_filled' and 'topic'.
            model_name: Embedding model name shown in the plot title.
            reduction_method_name: Name of the reduction method, used in the
                title and the axis labels.
        """
        # Define the color scale for sentiment (1=Red, 3=Yellow, 5=Green)
        # Using Plotly's built-in RdYlGn is good for this.
        color_scale = px.colors.diverging.RdYlGn

        fig = px.scatter(
            df,
            x='x',
            y='y',
            color='sentiment_score_filled',  # Color points by sentiment score
            color_continuous_scale=color_scale,  # Use the Red-Yellow-Green scale
            range_color=[1, 5],  # Ensure scale covers the full 1-5 range
            color_continuous_midpoint=3,  # Center the scale at neutral (3)
            text='index',  # Display index number on points
            hover_name='index',  # Show index number prominently on hover
            hover_data={  # Configure hover tooltips
                'sentence': True,  # Show full sentence
                'sentiment_score': True,  # Show original sentiment score (can be None)
                'topic': True,  # Show detected topic
                'x': False,  # Hide coordinates
                'y': False,
                'index': False,
                'sentiment_score_filled': False,  # Hide the filled value
            },
            # Plotly renders line breaks in titles only via <br>.
            title=(
                f'Interactive 2D {reduction_method_name} of Sentence Embeddings ({model_name})'
                f'<br>Colored by Sentiment (1-5), Analyzed by {ANALYSIS_MODEL}'
            ),
            # `labels` is keyed by column name, so the color label must use
            # the name of the column passed to `color`.
            labels={
                'x': f'{reduction_method_name} Component 1',
                'y': f'{reduction_method_name} Component 2',
                'sentiment_score_filled': 'Sentiment Score',  # Label for the color bar
            },
        )

        # --- Optional Customizations ---
        fig.update_traces(
            textposition='top center',
            textfont_size=10,
            marker=dict(
                size=8,  # Slightly larger markers
                line=dict(width=1, color='DarkSlateGrey'),  # Add outline to markers
            ),
        )
        fig.update_layout(
            coloraxis_colorbar=dict(
                title="Sentiment Score",
                tickvals=[1, 2, 3, 4, 5],  # Define ticks on the color bar
                ticktext=["1 (Neg)", "2", "3 (Neu)", "4", "5 (Pos)"],  # Labels for ticks
            ),
            hovermode='closest',
            title_x=0.5,
        )
        fig.show()

    # --- Call the plotting function ---
    plot_interactive_analyzed(df, EMBEDDING_MODEL, REDUCTION_METHOD)
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()