You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This repo is archived. You can view files and clone it, but cannot push or open issues/pull-requests.

97 lines
3.7 KiB
Python

import requests
import json
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# --- Configuration ---
OLLAMA_URL = "http://localhost:11434/api/embeddings"
MODEL_NAME = "nomic-embed-text"  # Make sure you have pulled this model

# --- Data: Sentences to Analyze ---
# The list is built from loosely related pairs (animals, fruit, weather,
# programming), so semantically similar sentences should end up near each
# other in the final 2D plot.
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "A fast, dark-colored mammal leaps above a sleepy canine.",
    "Apples and oranges are common fruits.",
    "Bananas grow in bunches.",
    "The weather today is sunny and warm.",
    "Expect rain and clouds tomorrow.",
    "Python is a popular programming language.",
    "Software development requires careful planning.",
]
# --- 1. Get Embeddings from Ollama ---
# Request one embedding vector per sentence from the local Ollama server.
# Any failure is treated as fatal: a partial embedding list would silently
# skew the t-SNE layout below.
embeddings = []
print(f"Getting embeddings using model: {MODEL_NAME}...")
for i, sentence in enumerate(sentences):
    print(f" Processing sentence {i+1}/{len(sentences)}: '{sentence[:30]}...'")
    try:
        payload = {
            "model": MODEL_NAME,
            "prompt": sentence
        }
        # timeout= prevents the script from hanging forever if Ollama is
        # unresponsive; embedding one short sentence is normally fast.
        response = requests.post(OLLAMA_URL, json=payload, timeout=60)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
        # Ollama's embedding API returns a single JSON object (not streamed).
        # .get() yields None if the "embedding" key is missing; the sanity
        # check after this loop catches that case.
        response_data = response.json()
        embeddings.append(response_data.get("embedding"))
    except requests.exceptions.RequestException as e:
        print(f"\nError connecting to Ollama or during API request: {e}")
        print("Ensure Ollama is running and the model name is correct.")
        raise SystemExit(1)  # non-zero status: this is an error path
    except json.JSONDecodeError as e:
        print(f"\nError decoding JSON response: {e}")
        print(f"Received text: {response.text}")
        raise SystemExit(1)
    except Exception as e:
        print(f"\nAn unexpected error occurred for sentence '{sentence}': {e}")
        raise SystemExit(1)
# --- Sanity check: every sentence must have produced a vector ---
# A response missing the "embedding" key appended None above; catch that
# (and an empty list) before handing the data to NumPy.
if not embeddings or any(vec is None for vec in embeddings):
    print("\nError: Failed to retrieve valid embeddings for some sentences.")
    raise SystemExit(1)  # non-zero status: this is an error path

embeddings_array = np.array(embeddings)
print(f"\nSuccessfully got {embeddings_array.shape[0]} embeddings with dimension {embeddings_array.shape[1]}.")

# --- 2. Dimensionality Reduction (t-SNE) ---
print("Reducing dimensionality using t-SNE...")
# t-SNE requires perplexity < n_samples, so cap it for small datasets.
perplexity_value = min(30, len(sentences) - 1)
if perplexity_value <= 0:
    print("\nError: Need at least 2 sentences for t-SNE.")
    raise SystemExit(1)

# Fixed random_state makes the layout reproducible; PCA init and the
# 'auto' learning rate are scikit-learn's recommended stable settings.
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity_value, init='pca', learning_rate='auto')
reduced_embeddings = tsne.fit_transform(embeddings_array)
print("Dimensionality reduction complete.")
# --- 3. Visualization ---
print("Plotting results...")
plt.figure(figsize=(10, 8))
plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1])

# Tag every point with its 1-based sentence number; the full sentence text
# lives in the figtext legend placed at the bottom of the figure.
for idx in range(len(sentences)):
    point = (reduced_embeddings[idx, 0], reduced_embeddings[idx, 1])
    plt.annotate(
        f"{idx+1}",               # numeric label matching the legend below
        point,
        textcoords="offset points",
        xytext=(0, 5),            # nudge the label just above the point
        ha='center',
        fontsize=8,
    )

plt.title(f'2D t-SNE visualization of sentence embeddings ({MODEL_NAME})')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.grid(True, linestyle='--', alpha=0.6)
# One legend line per point: "N: <first 60 chars of the sentence>..."
legend_lines = [f"{idx+1}: {text[:60]}..." for idx, text in enumerate(sentences)]
plt.figtext(0.01, 0.01, "\n".join(legend_lines), fontsize=7)
plt.tight_layout(rect=[0, 0.1, 1, 1])  # reserve space for the legend text
plt.show()
print("Done.")