You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
This repo is archived. You can view files and clone it, but cannot push or open issues/pull-requests.

366 lines
16 KiB
Python

# Marimo notebook bootstrap: create the App container that registers every
# @app.cell below. `__generated_with` records the marimo version that wrote
# this file (used by marimo for compatibility checks).
import marimo
__generated_with = "0.13.7"
app = marimo.App(width="medium")
@app.cell
def _():
    """Import shared libraries and pick a dimensionality-reduction backend."""
    import requests
    import json
    import numpy as np
    import pandas as pd
    import time  # To add delays between API calls

    # BUG FIX: the cell returns BOTH `TSNE` and `umap`, but the original code
    # only ever bound one of them (whichever import branch ran), so the
    # `return` statement always raised NameError. Pre-bind both to None so the
    # return tuple is valid; the downstream cell only uses the selected one.
    TSNE = None
    umap = None

    # Option 1: UMAP (Recommended); fall back to t-SNE if not installed.
    try:
        import umap
        REDUCTION_METHOD = 'UMAP'
    except ImportError:
        print("UMAP not found, falling back to t-SNE. Install with: pip install umap-learn")
        from sklearn.manifold import TSNE
        REDUCTION_METHOD = 't-SNE'
    # Option 2: Force t-SNE
    # from sklearn.manifold import TSNE
    # REDUCTION_METHOD = 't-SNE'

    import plotly.express as px
    import plotly.graph_objects as go  # For more control if needed
    return REDUCTION_METHOD, TSNE, json, np, pd, px, requests, time, umap
@app.cell
def _():
    """Define connection, model, and retry settings for the local Ollama server."""
    # --- Configuration ---
    # Where the Ollama REST API lives and which models to use.
    OLLAMA_API_BASE_URL = "http://localhost:11434/api"
    EMBEDDING_MODEL = "nomic-embed-text"  # model that embeds each sentence
    ANALYSIS_MODEL = "gemma3:1b"          # LLM for sentiment/topic analysis (ensure pulled)

    # Retry behaviour applied to every Ollama request.
    REQUEST_TIMEOUT = 60  # seconds before a single request is abandoned
    RETRY_DELAY = 5       # seconds to wait between attempts
    MAX_RETRIES = 3       # total attempts per request
    return (
        ANALYSIS_MODEL,
        EMBEDDING_MODEL,
        MAX_RETRIES,
        OLLAMA_API_BASE_URL,
        REQUEST_TIMEOUT,
        RETRY_DELAY,
    )
@app.cell
def _():
    """Build the corpus of sentences to embed and analyse, grouped by sentiment."""
    # --- Data: Sentences to Analyze (Focus on Sentiment & Topics) ---
    _positive = [
        "I had an absolutely wonderful time on vacation!",               # Travel
        "This is the best pizza I've ever tasted, truly amazing.",       # Food
        "The new software update significantly improved performance.",   # Tech
        "She was overjoyed to receive the award.",                       # Emotion
        "What a beautiful, sunny day for a picnic!",                     # Weather/Activity
    ]
    _negative = [
        "The airline lost my luggage, ruining the start of my trip.",    # Travel
        "I found the meal to be bland and overpriced.",                  # Food
        "Debugging this legacy code is incredibly frustrating.",         # Tech
        "He felt heartbroken and betrayed after the argument.",          # Emotion
        "The constant rain made the whole weekend gloomy.",              # Weather/Activity
    ]
    _neutral = [
        "The train is scheduled to depart at 3:00 PM.",                  # Travel
        "The ingredients listed include flour, water, and salt.",        # Food
        "The system requires 8GB of RAM to operate.",                    # Tech
        "Please file the report by the end of the day.",                 # Work/Instruction
        "The weather report indicates a chance of showers tomorrow.",    # Weather
    ]
    # Order matters downstream only for display indices; keep positive,
    # negative, then neutral, matching the original flat list.
    sentences = _positive + _negative + _neutral
    return (sentences,)
@app.cell
def _(
    EMBEDDING_MODEL,
    MAX_RETRIES,
    OLLAMA_API_BASE_URL,
    REQUEST_TIMEOUT,
    RETRY_DELAY,
    json,
    np,
    requests,
    sentences,
    time,
):
    # --- Helper Function for Ollama API Calls ---
    def ollama_request(endpoint, model_name, payload_key, payload_value, stream=False):
        """Sends a request to a specified Ollama endpoint and handles retries.

        Args:
            endpoint: API path segment, e.g. "embeddings" or "generate".
            model_name: Ollama model to route the request to.
            payload_key: JSON key the endpoint expects (e.g. "prompt").
            payload_value: value for that key (the text to embed/analyse).
            stream: whether to request a streamed response (generate only).

        Returns:
            The decoded JSON response dict, or None after MAX_RETRIES failures.
        """
        url = f"{OLLAMA_API_BASE_URL}/{endpoint}"
        payload = {
            "model": model_name,
            payload_key: payload_value,
            "stream": stream  # Important for generate endpoint if expecting single response
        }
        print(f" Sending request to {url} for model {model_name}...")  # Debug info
        for attempt in range(MAX_RETRIES):
            try:
                response = requests.post(url, json=payload, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
                # Handle potential streaming vs non-streaming (generate vs embeddings)
                if stream:
                    # Process line-by-line: each streamed line is its own JSON chunk.
                    full_response_content = ""
                    for line in response.iter_lines():
                        if line:
                            try:
                                decoded_line = line.decode('utf-8')
                                json_chunk = json.loads(decoded_line)
                                # Accumulate response content from each chunk.
                                full_response_content += json_chunk.get('response', '')  # Example for generate
                                if json_chunk.get('done'):
                                    # Last chunk of a stream may carry final details;
                                    # nothing extra is needed here.
                                    pass
                            except json.JSONDecodeError:
                                print(f" Warning: Could not decode JSON line: {line}")
                    # This needs careful implementation based on streaming needs.
                    # For simple generation (stream=False), the non-streaming path below is better.
                    print(" Warning: Streaming response handling is basic. Recommend stream=False for generate.")
                    return {"response": full_response_content}  # Example structure
                else:  # Not streaming (embeddings or generate with stream=False)
                    response_data = response.json()
                    # print(f" Received response: {str(response_data)[:100]}...")  # Debug: Print part of response
                    return response_data
            except requests.exceptions.Timeout:
                print(f" Attempt {attempt + 1}/{MAX_RETRIES}: Request timed out.")
            except requests.exceptions.RequestException as e:
                print(f" Attempt {attempt + 1}/{MAX_RETRIES}: Request failed: {e}")
            # Only reached after an exception: either sleep and retry, or give up.
            if attempt < MAX_RETRIES - 1:
                print(f" Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)
            else:
                print(f" Max retries reached. Failing request for: {payload_value[:50]}...")
                return None  # Indicate failure

    # --- 1. Get Embeddings from Ollama ---
    print(f"\n--- Getting Embeddings ({EMBEDDING_MODEL}) ---")
    embeddings = []
    valid_sentences_indices = []  # Keep track of sentences for which we got embeddings
    for i1, sentence1 in enumerate(sentences):
        print(f"Processing sentence {i1+1}/{len(sentences)}: '{sentence1[:40]}...'")
        response_data = ollama_request("embeddings", EMBEDDING_MODEL, "prompt", sentence1)
        if response_data and "embedding" in response_data:
            embeddings.append(response_data["embedding"])
            valid_sentences_indices.append(i1)
        else:
            print(f" Failed to get embedding for sentence {i1+1}. Skipping.")
    # Filter sentences list to only include those with successful embeddings,
    # so rows stay aligned with the embeddings array downstream.
    sentences_filtered = [sentences[i] for i in valid_sentences_indices]
    if not embeddings:
        print("\nError: No embeddings were successfully retrieved. Exiting.")
        exit()
    embeddings_array = np.array(embeddings)
    print(f"\nSuccessfully got {embeddings_array.shape[0]} embeddings with dimension {embeddings_array.shape[1]}.")
    return embeddings_array, ollama_request, sentences_filtered
@app.cell
def _(REDUCTION_METHOD, TSNE, embeddings_array, umap):
    """Project the high-dimensional embeddings down to 2D with UMAP or t-SNE."""
    # --- 2. Dimensionality Reduction ---
    print(f"\n--- Reducing Dimensionality ({REDUCTION_METHOD}) ---")
    n_samples = embeddings_array.shape[0]
    if n_samples <= 1:
        print("\nError: Need at least 2 data points for dimensionality reduction.")
        exit()

    reduced_embeddings = None
    if REDUCTION_METHOD == 'UMAP':
        # n_neighbors must stay below the sample count; clamp into a sane range.
        neighbor_count = min(15, n_samples - 1)
        neighbor_count = max(2, n_samples - 1) if neighbor_count < 2 else neighbor_count
        print(f" Using UMAP with n_neighbors={neighbor_count}")
        projector = umap.UMAP(n_neighbors=neighbor_count, n_components=2,
                              min_dist=0.1, metric='cosine', random_state=42)
        reduced_embeddings = projector.fit_transform(embeddings_array)
    elif REDUCTION_METHOD == 't-SNE':
        # Perplexity must be positive and below the sample count.
        perplexity = min(30, n_samples - 1)
        perplexity = max(1, n_samples - 1) if perplexity <= 0 else perplexity
        print(f" Using t-SNE with perplexity={perplexity}")
        projector = TSNE(n_components=2, random_state=42, perplexity=perplexity,
                         init='pca', learning_rate='auto')
        reduced_embeddings = projector.fit_transform(embeddings_array)

    if reduced_embeddings is None:
        print("Error: Dimensionality reduction failed.")
        exit()
    print("Dimensionality reduction complete.")
    return (reduced_embeddings,)
@app.cell
def _(ANALYSIS_MODEL, ollama_request, sentences_filtered, time):
    # --- 3. Perform LLM Analysis (Sentiment & Topic) ---
    # For each embedded sentence, ask the analysis LLM for (a) a 1-5 sentiment
    # score and (b) a topic category from a closed list; collect one dict per
    # sentence into analysis_results.
    print(f"\n--- Performing LLM Analysis ({ANALYSIS_MODEL}) ---")
    analysis_results = []
    possible_topics = ["Travel", "Food", "Tech", "Emotion", "Weather", "Work", "Other"]  # Predefined topics
    # Define prompts (Keep them simple for smaller models)
    sentiment_prompt_template = """Rate the sentiment of the following sentence on a scale of 1 (very negative) to 5 (very positive). Output ONLY the number (1, 2, 3, 4, or 5).
Sentence: {}
Sentiment rating (1-5):"""
    topic_prompt_template = f"""Categorize the main topic of the following sentence into ONE of these categories: {', '.join(possible_topics)}. Output ONLY the category name.
Sentence: {{}}
Topic category:"""
    for i2, sentence2 in enumerate(sentences_filtered):
        print(f"Analyzing sentence {i2+1}/{len(sentences_filtered)}: '{sentence2[:40]}...'")
        sentence_analysis = {"sentence": sentence2, "sentiment_score": None, "topic": "Unknown"}  # Defaults
        # --- Sentiment Analysis ---
        print(" Analyzing sentiment...")
        sentiment_prompt = sentiment_prompt_template.format(sentence2)
        sentiment_response = ollama_request("generate", ANALYSIS_MODEL, "prompt", sentiment_prompt, stream=False)
        if sentiment_response and "response" in sentiment_response:
            llm_output = sentiment_response["response"].strip()
            try:
                # Try to extract just the number from the first token, stripping
                # trailing punctuation the LLM may append.
                score = int(llm_output.split()[0].strip('.,!?;:"\''))  # Basic extraction
                if 1 <= score <= 5:
                    sentence_analysis["sentiment_score"] = score
                    print(f" Sentiment score: {score}")
                else:
                    print(f" Warning: Sentiment score out of range ({score}). LLM Output: '{llm_output}'")
            except (ValueError, IndexError):
                print(f" Warning: Could not parse sentiment score. LLM Output: '{llm_output}'")
        else:
            print(" Failed to get sentiment response.")
        # Add a small delay to avoid overwhelming the API
        time.sleep(0.5)
        # --- Topic Analysis ---
        print(" Analyzing topic...")
        topic_prompt = topic_prompt_template.format(sentence2)
        topic_response = ollama_request("generate", ANALYSIS_MODEL, "prompt", topic_prompt, stream=False)
        if topic_response and "response" in topic_response:
            llm_output = topic_response["response"].strip().capitalize()  # Capitalize for consistency
        # Find the best match from our predefined list
            found_topic = "Unknown"
            for topic in possible_topics:
                if topic.lower() in llm_output.lower():  # Simple check if topic name is in output
                    found_topic = topic
                    break  # Take the first match
            sentence_analysis["topic"] = found_topic
            print(f" Topic detected: {found_topic} (LLM Output: '{llm_output}')")
            if found_topic == "Unknown":
                print(f" Warning: Could not match topic to predefined list. LLM Output: '{llm_output}'")
        else:
            print(" Failed to get topic response.")
        analysis_results.append(sentence_analysis)
        time.sleep(0.5)  # Another small delay
    return (analysis_results,)
@app.cell
def _(analysis_results, pd, reduced_embeddings):
    """Merge the LLM analysis with the 2D coordinates into one plotting DataFrame."""
    # --- 4. Prepare Data for Plotting ---
    print("\n--- Preparing Data for Plotting ---")
    frame = pd.DataFrame(analysis_results)
    # Attach the reduced 2D coordinates (row order matches analysis_results).
    frame = frame.assign(
        x=reduced_embeddings[:, 0],
        y=reduced_embeddings[:, 1],
    )
    # 1-based label shown on each plotted point.
    frame['index'] = [pos + 1 for pos in range(len(frame))]
    # Missing sentiment scores are treated as neutral (3) for colouring only;
    # the original (possibly None) score stays in 'sentiment_score'.
    frame['sentiment_score_filled'] = frame['sentiment_score'].fillna(3).astype(int)
    print("Analysis and data preparation complete.")
    df = frame
    return (df,)
@app.cell
def _(ANALYSIS_MODEL, EMBEDDING_MODEL, REDUCTION_METHOD, df, px):
    # --- 5. Visualization with Plotly (Interactive & Colored) ---
    def plot_interactive_analyzed(df, model_name, reduction_method_name="UMAP"):
        """
        Generates an interactive scatter plot colored by sentiment, with hover info.
        """
        # Hover tooltip configuration: show the text fields, hide raw
        # coordinates and helper columns.
        hover_fields = {
            'sentence': True,
            'sentiment_score': True,
            'topic': True,
            'x': False,
            'y': False,
            'index': False,
            'sentiment_score_filled': False,
        }
        axis_labels = {
            'x': f'{reduction_method_name} Component 1',
            'y': f'{reduction_method_name} Component 2',
            'color': 'Sentiment Score',
        }
        # Diverging Red-Yellow-Green palette maps 1 (negative) -> red,
        # 3 (neutral) -> yellow, 5 (positive) -> green.
        fig = px.scatter(
            df,
            x='x',
            y='y',
            color='sentiment_score_filled',
            color_continuous_scale=px.colors.diverging.RdYlGn,
            range_color=[1, 5],
            color_continuous_midpoint=3,
            text='index',
            hover_name='index',
            hover_data=hover_fields,
            title=f'Interactive 2D {reduction_method_name} of Sentence Embeddings ({model_name})<br>Colored by Sentiment (1-5), Analyzed by {ANALYSIS_MODEL}',
            labels=axis_labels,
        )
        # Cosmetic tweaks: labels above points, outlined markers.
        fig.update_traces(
            textposition='top center',
            textfont_size=10,
            marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey')),
        )
        fig.update_layout(
            coloraxis_colorbar=dict(
                title="Sentiment Score",
                tickvals=[1, 2, 3, 4, 5],
                ticktext=["1 (Neg)", "2", "3 (Neu)", "4", "5 (Pos)"],
            ),
            hovermode='closest',
            title_x=0.5,
        )
        fig.show()

    # --- Call the plotting function ---
    plot_interactive_analyzed(df, EMBEDDING_MODEL, REDUCTION_METHOD)
    return
@app.cell
def _():
    # Intentionally empty scratch cell (marimo placeholder).
    return
# Run the notebook as a standalone marimo app when executed as a script.
if __name__ == "__main__":
    app.run()