You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
366 lines
16 KiB
Python
366 lines
16 KiB
Python
import marimo
|
|
|
|
__generated_with = "0.13.7"
|
|
app = marimo.App(width="medium")
|
|
|
|
|
|
@app.cell
def _():
    import requests
    import json
    import numpy as np
    import pandas as pd
    import time  # To add delays

    # Choose your dimensionality reduction method.
    #
    # BUG FIX: the original returned both `TSNE` and `umap`, but only ever
    # imported ONE of them (the try branch binds umap, the except branch binds
    # TSNE), so the return statement always raised NameError. Pre-bind both to
    # None so the unused one is simply a None placeholder for downstream cells.
    umap = None
    TSNE = None

    # Option 1: UMAP (Recommended)
    try:
        import umap
        REDUCTION_METHOD = 'UMAP'
    except ImportError:
        print("UMAP not found, falling back to t-SNE. Install with: pip install umap-learn")
        from sklearn.manifold import TSNE
        REDUCTION_METHOD = 't-SNE'

    # Option 2: Force t-SNE
    # from sklearn.manifold import TSNE
    # REDUCTION_METHOD = 't-SNE'

    import plotly.express as px
    import plotly.graph_objects as go  # For more control if needed
    return REDUCTION_METHOD, TSNE, json, np, pd, px, requests, time, umap
|
|
|
|
|
|
@app.cell
def _():
    # --- Configuration ---
    # Ollama endpoint plus the two models used by this notebook. Both models
    # must already be pulled locally (`ollama pull <name>`).
    OLLAMA_API_BASE_URL = "http://localhost:11434/api"
    EMBEDDING_MODEL = "nomic-embed-text"  # Embedding model for sentence vectors
    ANALYSIS_MODEL = "gemma3:1b"          # LLM used for sentiment/topic analysis

    # Request/retry tuning for the Ollama HTTP calls.
    REQUEST_TIMEOUT = 60  # per-request timeout, seconds
    MAX_RETRIES = 3       # attempts before giving up on a request
    RETRY_DELAY = 5       # pause between attempts, seconds

    return (
        ANALYSIS_MODEL,
        EMBEDDING_MODEL,
        MAX_RETRIES,
        OLLAMA_API_BASE_URL,
        REQUEST_TIMEOUT,
        RETRY_DELAY,
    )
|
|
|
|
|
|
@app.cell
def _():
    # --- Data: Sentences to Analyze (Focus on Sentiment & Topics) ---
    # Fifteen hand-written sentences, five per sentiment bucket, spanning the
    # topic categories used later (Travel, Food, Tech, Emotion, Weather, Work).
    positive_sentences = [
        "I had an absolutely wonderful time on vacation!",              # Travel
        "This is the best pizza I've ever tasted, truly amazing.",      # Food
        "The new software update significantly improved performance.",  # Tech
        "She was overjoyed to receive the award.",                      # Emotion
        "What a beautiful, sunny day for a picnic!",                    # Weather/Activity
    ]
    negative_sentences = [
        "The airline lost my luggage, ruining the start of my trip.",   # Travel
        "I found the meal to be bland and overpriced.",                 # Food
        "Debugging this legacy code is incredibly frustrating.",        # Tech
        "He felt heartbroken and betrayed after the argument.",         # Emotion
        "The constant rain made the whole weekend gloomy.",             # Weather/Activity
    ]
    neutral_sentences = [
        "The train is scheduled to depart at 3:00 PM.",                 # Travel
        "The ingredients listed include flour, water, and salt.",       # Food
        "The system requires 8GB of RAM to operate.",                   # Tech
        "Please file the report by the end of the day.",                # Work/Instruction
        "The weather report indicates a chance of showers tomorrow.",   # Weather
    ]

    # Preserve the original ordering: positive, then negative, then neutral.
    sentences = positive_sentences + negative_sentences + neutral_sentences
    return (sentences,)
|
|
|
|
|
|
@app.cell
def _(
    EMBEDDING_MODEL,
    MAX_RETRIES,
    OLLAMA_API_BASE_URL,
    REQUEST_TIMEOUT,
    RETRY_DELAY,
    json,
    np,
    requests,
    sentences,
    time,
):
    # --- Helper Function for Ollama API Calls ---
    def ollama_request(endpoint, model_name, payload_key, payload_value, stream=False):
        """Sends a request to a specified Ollama endpoint and handles retries.

        Args:
            endpoint: Path segment under OLLAMA_API_BASE_URL
                (e.g. "embeddings" or "generate").
            model_name: Ollama model to use for this request.
            payload_key: Name of the payload field carrying the input
                ("prompt" for both endpoints used in this notebook).
            payload_value: The text sent under ``payload_key``.
            stream: Whether to request a streamed response. The streaming
                branch below is a basic line-by-line accumulator; callers in
                this notebook always pass stream=False.

        Returns:
            The parsed JSON response dict on success, or None after
            MAX_RETRIES failed attempts (timeout or request error).
        """
        url = f"{OLLAMA_API_BASE_URL}/{endpoint}"
        payload = {
            "model": model_name,
            payload_key: payload_value,
            "stream": stream # Important for generate endpoint if expecting single response
        }
        print(f" Sending request to {url} for model {model_name}...") # Debug info

        for attempt in range(MAX_RETRIES):
            try:
                response = requests.post(url, json=payload, timeout=REQUEST_TIMEOUT)
                response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)

                # Handle potential streaming vs non-streaming (generate vs embeddings)
                if stream:
                    # Process line-by-line if needed, for now assume generate returns single JSON when stream=False
                    # This part might need adjustment depending on exact API behavior if stream=True
                    full_response_content = ""
                    for line in response.iter_lines():
                        if line:
                            try:
                                decoded_line = line.decode('utf-8')
                                json_chunk = json.loads(decoded_line)
                                # Accumulate response content if needed, e.g., from json_chunk.get('response', '')
                                full_response_content += json_chunk.get('response', '') # Example for generate
                                if json_chunk.get('done'):
                                    # If using generate with stream=True, the last chunk might have final details
                                    # For stream=False with generate, we expect one JSON object below
                                    pass
                            except json.JSONDecodeError:
                                print(f" Warning: Could not decode JSON line: {line}")
                    # This needs careful implementation based on streaming needs.
                    # For simple generation (stream=False), the non-streaming part below is better.
                    # For simplicity, let's enforce stream=False for generate.
                    print(" Warning: Streaming response handling is basic. Recommend stream=False for generate.")
                    return {"response": full_response_content} # Example structure

                else: # Not streaming (embeddings or generate with stream=False)
                    response_data = response.json()
                    # print(f" Received response: {str(response_data)[:100]}...") # Debug: Print part of response
                    return response_data

            except requests.exceptions.Timeout:
                print(f" Attempt {attempt + 1}/{MAX_RETRIES}: Request timed out.")
            except requests.exceptions.RequestException as e:
                print(f" Attempt {attempt + 1}/{MAX_RETRIES}: Request failed: {e}")

            # Only reached when this attempt raised; sleep-and-retry unless
            # this was the final attempt, in which case signal failure.
            if attempt < MAX_RETRIES - 1:
                print(f" Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)
            else:
                print(f" Max retries reached. Failing request for: {payload_value[:50]}...")
                return None # Indicate failure

    # --- 1. Get Embeddings from Ollama ---
    # Request one embedding per sentence; sentences whose request fails are
    # dropped so downstream arrays stay aligned with sentences_filtered.
    print(f"\n--- Getting Embeddings ({EMBEDDING_MODEL}) ---")
    embeddings = []
    valid_sentences_indices = [] # Keep track of sentences for which we got embeddings
    for i1, sentence1 in enumerate(sentences):
        print(f"Processing sentence {i1+1}/{len(sentences)}: '{sentence1[:40]}...'")
        response_data = ollama_request("embeddings", EMBEDDING_MODEL, "prompt", sentence1)
        if response_data and "embedding" in response_data:
            embeddings.append(response_data["embedding"])
            valid_sentences_indices.append(i1)
        else:
            print(f" Failed to get embedding for sentence {i1+1}. Skipping.")

    # Filter sentences list to only include those with successful embeddings
    sentences_filtered = [sentences[i] for i in valid_sentences_indices]
    if not embeddings:
        print("\nError: No embeddings were successfully retrieved. Exiting.")
        # NOTE(review): exit() inside a marimo cell kills the whole kernel;
        # consider raising instead — confirm intended behavior.
        exit()

    # Rows = sentences, columns = embedding dimensions.
    embeddings_array = np.array(embeddings)
    print(f"\nSuccessfully got {embeddings_array.shape[0]} embeddings with dimension {embeddings_array.shape[1]}.")
    return embeddings_array, ollama_request, sentences_filtered
|
|
|
|
|
|
@app.cell
def _(REDUCTION_METHOD, TSNE, embeddings_array, umap):
    # --- 2. Dimensionality Reduction ---
    # Project the high-dimensional sentence embeddings down to 2D for plotting,
    # using whichever method the import cell selected (UMAP preferred).
    print(f"\n--- Reducing Dimensionality ({REDUCTION_METHOD}) ---")
    sample_count = embeddings_array.shape[0]
    if sample_count <= 1:
        print("\nError: Need at least 2 data points for dimensionality reduction.")
        exit()

    reduced_embeddings = None
    if REDUCTION_METHOD == 'UMAP':
        # n_neighbors must stay below the sample count; clamp small datasets.
        neighbor_count = min(15, sample_count - 1)
        if neighbor_count < 2:
            neighbor_count = max(2, sample_count - 1)
        print(f" Using UMAP with n_neighbors={neighbor_count}")
        umap_reducer = umap.UMAP(
            n_neighbors=neighbor_count,
            n_components=2,
            min_dist=0.1,
            metric='cosine',
            random_state=42,
        )
        reduced_embeddings = umap_reducer.fit_transform(embeddings_array)
    elif REDUCTION_METHOD == 't-SNE':
        # Perplexity must be positive and below the sample count.
        perplexity = min(30, sample_count - 1)
        if perplexity <= 0:
            perplexity = max(1, sample_count - 1)
        print(f" Using t-SNE with perplexity={perplexity}")
        tsne_reducer = TSNE(
            n_components=2,
            random_state=42,
            perplexity=perplexity,
            init='pca',
            learning_rate='auto',
        )
        reduced_embeddings = tsne_reducer.fit_transform(embeddings_array)

    if reduced_embeddings is None:
        print("Error: Dimensionality reduction failed.")
        exit()

    print("Dimensionality reduction complete.")
    return (reduced_embeddings,)
|
|
|
|
|
|
@app.cell
def _(ANALYSIS_MODEL, ollama_request, sentences_filtered, time):
    # --- 3. Perform LLM Analysis (Sentiment & Topic) ---
    # For each embedded sentence, ask the analysis LLM for (a) a 1-5 sentiment
    # rating and (b) a topic category, then collect one result dict per
    # sentence: {"sentence", "sentiment_score" (int or None), "topic" (str)}.
    print(f"\n--- Performing LLM Analysis ({ANALYSIS_MODEL}) ---")
    analysis_results = []
    possible_topics = ["Travel", "Food", "Tech", "Emotion", "Weather", "Work", "Other"] # Predefined topics

    # Define prompts (Keep them simple for smaller models)
    sentiment_prompt_template = """Rate the sentiment of the following sentence on a scale of 1 (very negative) to 5 (very positive). Output ONLY the number (1, 2, 3, 4, or 5).
Sentence: {}
Sentiment rating (1-5):"""

    topic_prompt_template = f"""Categorize the main topic of the following sentence into ONE of these categories: {', '.join(possible_topics)}. Output ONLY the category name.
Sentence: {{}}
Topic category:"""

    for i2, sentence2 in enumerate(sentences_filtered):
        print(f"Analyzing sentence {i2+1}/{len(sentences_filtered)}: '{sentence2[:40]}...'")
        sentence_analysis = {"sentence": sentence2, "sentiment_score": None, "topic": "Unknown"} # Defaults

        # --- Sentiment Analysis ---
        print(" Analyzing sentiment...")
        sentiment_prompt = sentiment_prompt_template.format(sentence2)
        sentiment_response = ollama_request("generate", ANALYSIS_MODEL, "prompt", sentiment_prompt, stream=False)

        if sentiment_response and "response" in sentiment_response:
            llm_output = sentiment_response["response"].strip()
            try:
                # Try to extract just the number: take the first whitespace
                # token and strip trailing punctuation before parsing.
                score = int(llm_output.split()[0].strip('.,!?;:"\'')) # Basic extraction
                if 1 <= score <= 5:
                    sentence_analysis["sentiment_score"] = score
                    print(f" Sentiment score: {score}")
                else:
                    # Parsed an int, but outside the 1-5 scale; keep None.
                    print(f" Warning: Sentiment score out of range ({score}). LLM Output: '{llm_output}'")
            except (ValueError, IndexError):
                # Non-numeric first token, or empty output.
                print(f" Warning: Could not parse sentiment score. LLM Output: '{llm_output}'")
        else:
            print(" Failed to get sentiment response.")

        # Add a small delay to avoid overwhelming the API
        time.sleep(0.5)

        # --- Topic Analysis ---
        print(" Analyzing topic...")
        topic_prompt = topic_prompt_template.format(sentence2)
        topic_response = ollama_request("generate", ANALYSIS_MODEL, "prompt", topic_prompt, stream=False)

        if topic_response and "response" in topic_response:
            llm_output = topic_response["response"].strip().capitalize() # Capitalize for consistency
            # Find the best match from our predefined list
            found_topic = "Unknown"
            for topic in possible_topics:
                # Case-insensitive substring match against the LLM output; the
                # list order decides ties since we take the first hit.
                if topic.lower() in llm_output.lower(): # Simple check if topic name is in output
                    found_topic = topic
                    break # Take the first match
            sentence_analysis["topic"] = found_topic
            print(f" Topic detected: {found_topic} (LLM Output: '{llm_output}')")

            if found_topic == "Unknown":
                print(f" Warning: Could not match topic to predefined list. LLM Output: '{llm_output}'")
        else:
            print(" Failed to get topic response.")

        analysis_results.append(sentence_analysis)
        time.sleep(0.5) # Another small delay
    return (analysis_results,)
|
|
|
|
|
|
@app.cell
def _(analysis_results, pd, reduced_embeddings):
    # --- 4. Prepare Data for Plotting ---
    # Combine the LLM analysis dicts with the 2D coordinates into one
    # DataFrame; rows of both inputs are aligned by construction.
    print("\n--- Preparing Data for Plotting ---")
    df = pd.DataFrame(analysis_results)

    # 1-based label shown on each plotted point.
    df['index'] = list(range(1, len(df) + 1))

    # 2D coordinates from the dimensionality-reduction step.
    df['x'] = reduced_embeddings[:, 0]
    df['y'] = reduced_embeddings[:, 1]

    # Handle missing sentiment scores for coloring (e.g., set to neutral 3)
    df['sentiment_score_filled'] = df['sentiment_score'].fillna(3).astype(int)

    print("Analysis and data preparation complete.")
    return (df,)
|
|
|
|
|
|
@app.cell
def _(ANALYSIS_MODEL, EMBEDDING_MODEL, REDUCTION_METHOD, df, px):
    # --- 5. Visualization with Plotly (Interactive & Colored) ---

    def plot_interactive_analyzed(df, model_name, reduction_method_name="UMAP"):
        """
        Generates an interactive scatter plot colored by sentiment, with hover info.
        """
        # Diverging Red-Yellow-Green scale: 1=Red (negative), 3=Yellow
        # (neutral), 5=Green (positive).
        sentiment_scale = px.colors.diverging.RdYlGn

        # Hover tooltip configuration: show the sentence, raw sentiment score
        # and topic; hide coordinates and internal helper columns.
        hover_fields = {
            'sentence': True,
            'sentiment_score': True,
            'topic': True,
            'x': False,
            'y': False,
            'index': False,
            'sentiment_score_filled': False,
        }
        axis_labels = {
            'x': f'{reduction_method_name} Component 1',
            'y': f'{reduction_method_name} Component 2',
            'color': 'Sentiment Score',
        }

        fig = px.scatter(
            df,
            x='x',
            y='y',
            color='sentiment_score_filled',   # color points by sentiment
            color_continuous_scale=sentiment_scale,
            range_color=[1, 5],               # full 1-5 sentiment range
            color_continuous_midpoint=3,      # center at neutral
            text='index',                     # index number on each point
            hover_name='index',
            hover_data=hover_fields,
            title=f'Interactive 2D {reduction_method_name} of Sentence Embeddings ({model_name})<br>Colored by Sentiment (1-5), Analyzed by {ANALYSIS_MODEL}',
            labels=axis_labels,
        )

        # --- Optional Customizations ---
        fig.update_traces(
            textposition='top center',
            textfont_size=10,
            marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey')),
        )
        fig.update_layout(
            coloraxis_colorbar=dict(
                title="Sentiment Score",
                tickvals=[1, 2, 3, 4, 5],
                ticktext=["1 (Neg)", "2", "3 (Neu)", "4", "5 (Pos)"],
            ),
            hovermode='closest',
            title_x=0.5,
        )

        fig.show()

    # --- Call the plotting function ---
    plot_interactive_analyzed(df, EMBEDDING_MODEL, REDUCTION_METHOD)
    return
|
|
|
|
|
|
@app.cell
def _():
    # Intentionally empty marimo cell (scratch space left by the notebook).
    return
|
|
|
|
|
|
if __name__ == "__main__":
    # Entry point when executed as a script: run the marimo notebook app.
    app.run()
|