# Visualize sentence embeddings: fetch vectors from a local Ollama model,
# reduce them to 2D with t-SNE, and plot the result with matplotlib.
import json
import sys

import matplotlib.pyplot as plt
import numpy as np
import requests
from sklearn.manifold import TSNE
# --- Configuration ---
# Endpoint of the local Ollama embeddings API and the model to query.
OLLAMA_URL = "http://localhost:11434/api/embeddings"
MODEL_NAME = "nomic-embed-text"  # Make sure you have pulled this model
# --- Data: Sentences to Analyze ---
# Four loose topic pairs (animals, fruit, weather, software); related
# sentences should land near each other in the 2D projection.
sentences = [
    # animals / paraphrase pair
    "The quick brown fox jumps over the lazy dog.",
    "A fast, dark-colored mammal leaps above a sleepy canine.",
    # fruit
    "Apples and oranges are common fruits.",
    "Bananas grow in bunches.",
    # weather
    "The weather today is sunny and warm.",
    "Expect rain and clouds tomorrow.",
    # software
    "Python is a popular programming language.",
    "Software development requires careful planning.",
]
# --- 1. Get Embeddings from Ollama ---
# Request one embedding vector per sentence; abort on the first failure,
# since t-SNE below needs a complete, rectangular matrix.
embeddings = []
print(f"Getting embeddings using model: {MODEL_NAME}...")
for i, sentence in enumerate(sentences):
    print(f"  Processing sentence {i+1}/{len(sentences)}: '{sentence[:30]}...'")
    try:
        payload = {
            "model": MODEL_NAME,
            "prompt": sentence
        }
        # Timeout prevents the script from hanging forever if the Ollama
        # server is up but unresponsive (the original call had none).
        response = requests.post(OLLAMA_URL, json=payload, timeout=60)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        # Ollama's embedding API returns a single JSON object (not streamed).
        # .get() yields None when the "embedding" key is absent; that case is
        # caught by the validation check after the loop.
        response_data = response.json()
        embeddings.append(response_data.get("embedding"))

    except requests.exceptions.RequestException as e:
        print(f"\nError connecting to Ollama or during API request: {e}")
        print("Ensure Ollama is running and the model name is correct.")
        sys.exit(1)  # sys.exit with nonzero status, not the site-module exit()
    except json.JSONDecodeError as e:
        print(f"\nError decoding JSON response: {e}")
        print(f"Received text: {response.text}")
        sys.exit(1)
    except Exception as e:
        print(f"\nAn unexpected error occurred for sentence '{sentence}': {e}")
        sys.exit(1)

# Check that every request produced a vector (None means the response had
# no "embedding" key).
if not embeddings or any(e is None for e in embeddings):
    print("\nError: Failed to retrieve valid embeddings for some sentences.")
    sys.exit(1)

embeddings_array = np.array(embeddings)
print(f"\nSuccessfully got {embeddings_array.shape[0]} embeddings with dimension {embeddings_array.shape[1]}.")
# --- 2. Dimensionality Reduction (t-SNE) ---
print("Reducing dimensionality using t-SNE...")
# t-SNE requires perplexity < n_samples, so clamp the usual default of 30.
perplexity_value = min(30, len(sentences) - 1)
if perplexity_value <= 0:
    print("\nError: Need at least 2 sentences for t-SNE.")
    sys.exit(1)  # sys.exit with nonzero status, not the site-module exit()

# Fixed random_state keeps the layout reproducible between runs;
# init='pca' and learning_rate='auto' are the recommended modern settings.
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity_value, init='pca', learning_rate='auto')
reduced_embeddings = tsne.fit_transform(embeddings_array)
print("Dimensionality reduction complete.")
# --- 3. Visualization ---
print("Plotting results...")
plt.figure(figsize=(10, 8))
scatter = plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1])

# Tag every point with its 1-based index; the full sentences are listed in
# the figure-bottom text so the plot itself stays uncluttered.
for idx in range(len(sentences)):
    point_xy = (reduced_embeddings[idx, 0], reduced_embeddings[idx, 1])
    plt.annotate(
        f"{idx+1}",                   # Simple numeric label
        point_xy,
        textcoords="offset points",
        xytext=(0, 5),                # Offset the text slightly above the point
        ha='center',
        fontsize=8,
    )

plt.title(f'2D t-SNE visualization of sentence embeddings ({MODEL_NAME})')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.grid(True, linestyle='--', alpha=0.6)

# Figure-level key mapping point numbers back to (truncated) sentences.
legend_lines = [f"{i+1}: {s[:60]}..." for i, s in enumerate(sentences)]
plt.figtext(0.01, 0.01, "\n".join(legend_lines), fontsize=7)  # Add legend text
plt.tight_layout(rect=[0, 0.1, 1, 1])  # Adjust layout to make space for legend
plt.show()

print("Done.")