You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

127 lines
4.1 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/env python3
from llama_cpp import Llama
from bs4 import BeautifulSoup
import requests
import json
# File that receives one JSON record per processed conference snippet.
OUTPUT_FILE = "results.json"

# Sample conference entry (raw HTML) used as the input half of the one-shot
# example in the prompt.
HTML_EXAMPLE = r"""<p><a href="http://www.crm.sns.it/event/507/" target="_blank" rel="noreferrer noopener">Statistical and Computational Aspects of Dynamics<br></a>Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi &#8211; SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria). Centro De Giorgi &#8211; SNS, Pisa. December 13 &#8211; 16, 2022.</p>"""

# Expected structured form of HTML_EXAMPLE; key order is kept as-is because it
# is serialised verbatim into the prompt.
_EXAMPLE_EVENT = {
    "title": "Statistical and Computational Aspects of Dynamics",
    "url": "http://www.crm.sns.it/event/507/",
    "description": "Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria). Location: Centro De Giorgi - SNS, Pisa.",
    "startDate": "2022-12-13",
    "endDate": "2022-12-16",
}

# JSON text of the example record, used as the output half of the one-shot
# example in the prompt.
OUTPUT_EXAMPLE = json.dumps(_EXAMPLE_EVENT)
def translate_to_json(conference_html: str) -> str:
    """Ask the language model to turn one conference HTML snippet into JSON text.

    The prompt is a one-shot example: a system instruction, then the
    ``HTML_EXAMPLE`` / ``OUTPUT_EXAMPLE`` pair, then ``conference_html``
    followed by an "OUTPUT JSON:" cue. Every non-system message is sent with
    the "user" role.

    Args:
        conference_html: HTML markup of a single conference entry.

    Returns:
        The message content of the first choice produced by the module-level
        ``llm``. The text is returned as-is; it is not parsed or validated here.
    """
    system_turn = {
        "role": "system",
        "content": "You are an assistant. Be precise in formatting the output and only output valid JSON using the specificied fields, without including additional fields or comments.",
    }
    # Worked example first, then the real input, each introduced by its label.
    user_turns = (
        "INPUT:",
        HTML_EXAMPLE,
        "OUTPUT JSON:",
        OUTPUT_EXAMPLE,
        "INPUT:",
        conference_html,
        "OUTPUT JSON:",
    )
    prompt = [system_turn]
    prompt.extend({"role": "user", "content": text} for text in user_turns)

    completion = llm.create_chat_completion(max_tokens=None, messages=prompt)
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]
def crawl_page(url, timeout=30):
    """Download one listing page and return the HTML of each conference entry.

    The entries are the non-empty sibling elements that follow the first
    ``<h2 class="wp-block-heading">`` and precede the first
    ``<div class="page-links">``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` can block forever on an unresponsive host.

    Returns:
        A list of HTML strings, one per element found between the two markers.

    Raises:
        Exception: If the response status is not 200, or if either marker
            element is missing from the page.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    print(f"Crawling {url}")
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch {url} (HTTP {response.status_code})")

    soup = BeautifulSoup(response.text, "html.parser")
    # The conference list sits between these two landmark elements.
    heading = soup.find("h2", class_="wp-block-heading")
    page_links = soup.find("div", class_="page-links")
    if heading is None or page_links is None:
        raise Exception(f"Failed to find elements in {url}")

    # Walk the heading's following siblings until the pagination block.
    snippets = []
    current = heading.find_next_sibling()
    while current and current != page_links:
        # Skip nodes without a tag name and elements with no visible text.
        if current.name is not None and current.text.strip():
            snippets.append(str(current))
        current = current.find_next_sibling()

    print(f"Found {len(snippets)} conferences")
    return snippets
# Listing pages: the first one is the base URL itself, pages 2 through 7 are
# addressed by appending their number to it.
baseurl = "https://www.dm.unipi.it/research/past-conferences/"
page_urls = [baseurl]
page_urls.extend(f"{baseurl}{page_number}" for page_number in range(2, 8))

# Crawl every page in order and gather all snippets into one flat list.
conference_html_snippets = []
for link in page_urls:
    conference_html_snippets.extend(crawl_page(link))

# Load the model and set the chat format; n_ctx=0 is meant to select the
# model's default context length.
llm = Llama(
    model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    n_ctx=0,
    chat_format="llama-2",
    verbose=False,
)
# The result file is a sequence of JSON objects, one per line.
# Each record holds the input HTML, the raw model output, a "success" flag and,
# when the output parsed, the decoded object under "json".
# The `with` block closes the file even if an iteration raises.
with open(OUTPUT_FILE, "w") as results_file:
    for conference_html in conference_html_snippets:
        print("--------------------------------------------------")
        print("Translating:")
        print(conference_html)
        conference_json = translate_to_json(conference_html)
        result = {
            "input_html": conference_html,
            "raw_output": conference_json,
        }
        print("Result:")
        print(conference_json)
        try:
            # Parse the result string into a JSON object to check correctness.
            conference_object = json.loads(conference_json)
        except (ValueError, TypeError):
            # json.JSONDecodeError is a ValueError; TypeError covers a
            # non-string model output. The record is still written, flagged
            # as failed, so the raw output can be inspected later.
            print("> json is invalid, skipping")
            result["success"] = False
        else:
            result["success"] = True
            result["json"] = conference_object
        json.dump(result, results_file)
        results_file.write("\n")
        # Flush after every record so finished work survives an interrupted run.
        results_file.flush()