# Visualize sentence embeddings: fetch vectors from a local Ollama model,
# reduce them to 2D with t-SNE, and plot the result with matplotlib.
import json
import sys

import matplotlib.pyplot as plt
import numpy as np
import requests
from sklearn.manifold import TSNE
# --- Configuration ---
# Endpoint of the local Ollama embeddings API and the model to query.
OLLAMA_URL = "http://localhost:11434/api/embeddings"
MODEL_NAME = "nomic-embed-text"  # Make sure you have pulled this model
# --- Data: Sentences to Analyze ---
# Four loose topic pairs (animals, fruit, weather, software); related
# sentences should land near each other in the 2D projection.
sentences = [
    # animals / paraphrase pair
    "The quick brown fox jumps over the lazy dog.",
    "A fast, dark-colored mammal leaps above a sleepy canine.",
    # fruit
    "Apples and oranges are common fruits.",
    "Bananas grow in bunches.",
    # weather
    "The weather today is sunny and warm.",
    "Expect rain and clouds tomorrow.",
    # software
    "Python is a popular programming language.",
    "Software development requires careful planning.",
]
# --- 1. Get Embeddings from Ollama ---
# Request one embedding vector per sentence; abort on the first failure,
# since t-SNE below needs a complete, rectangular matrix.
embeddings = []
print(f"Getting embeddings using model: {MODEL_NAME}...")
for i, sentence in enumerate(sentences):
    print(f"  Processing sentence {i+1}/{len(sentences)}: '{sentence[:30]}...'")
    try:
        payload = {
            "model": MODEL_NAME,
            "prompt": sentence
        }
        # Timeout prevents the script from hanging forever if the Ollama
        # server is up but unresponsive (the original call had none).
        response = requests.post(OLLAMA_URL, json=payload, timeout=60)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        # Ollama's embedding API returns a single JSON object (not streamed).
        # .get() yields None when the "embedding" key is absent; that case is
        # caught by the validation check after the loop.
        response_data = response.json()
        embeddings.append(response_data.get("embedding"))

    except requests.exceptions.RequestException as e:
        print(f"\nError connecting to Ollama or during API request: {e}")
        print("Ensure Ollama is running and the model name is correct.")
        sys.exit(1)  # sys.exit with nonzero status, not the site-module exit()
    except json.JSONDecodeError as e:
        print(f"\nError decoding JSON response: {e}")
        print(f"Received text: {response.text}")
        sys.exit(1)
    except Exception as e:
        print(f"\nAn unexpected error occurred for sentence '{sentence}': {e}")
        sys.exit(1)

# Check that every request produced a vector (None means the response had
# no "embedding" key).
if not embeddings or any(e is None for e in embeddings):
    print("\nError: Failed to retrieve valid embeddings for some sentences.")
    sys.exit(1)

embeddings_array = np.array(embeddings)
print(f"\nSuccessfully got {embeddings_array.shape[0]} embeddings with dimension {embeddings_array.shape[1]}.")
# --- 2. Dimensionality Reduction (t-SNE) ---
print("Reducing dimensionality using t-SNE...")
# t-SNE requires perplexity < n_samples, so clamp the usual default of 30.
perplexity_value = min(30, len(sentences) - 1)
if perplexity_value <= 0:
    print("\nError: Need at least 2 sentences for t-SNE.")
    sys.exit(1)  # sys.exit with nonzero status, not the site-module exit()

# Fixed random_state keeps the layout reproducible between runs;
# init='pca' and learning_rate='auto' are the recommended modern settings.
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity_value, init='pca', learning_rate='auto')
reduced_embeddings = tsne.fit_transform(embeddings_array)
print("Dimensionality reduction complete.")
# --- 3. Visualization ---
print("Plotting results...")
plt.figure(figsize=(10, 8))
scatter = plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1])

# Tag every point with its 1-based index; the full sentences are listed in
# the figure-bottom text so the plot itself stays uncluttered.
for idx in range(len(sentences)):
    point_xy = (reduced_embeddings[idx, 0], reduced_embeddings[idx, 1])
    plt.annotate(
        f"{idx+1}",                   # Simple numeric label
        point_xy,
        textcoords="offset points",
        xytext=(0, 5),                # Offset the text slightly above the point
        ha='center',
        fontsize=8,
    )

plt.title(f'2D t-SNE visualization of sentence embeddings ({MODEL_NAME})')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.grid(True, linestyle='--', alpha=0.6)

# Figure-level key mapping point numbers back to (truncated) sentences.
legend_lines = [f"{i+1}: {s[:60]}..." for i, s in enumerate(sentences)]
plt.figtext(0.01, 0.01, "\n".join(legend_lines), fontsize=7)  # Add legend text
plt.tight_layout(rect=[0, 0.1, 1, 1])  # Adjust layout to make space for legend
plt.show()

print("Done.")