You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

162 lines
6.3 KiB
Python

#!/usr/bin/env python3
from llama_cpp import Llama
from bs4 import BeautifulSoup
import requests
import json
# Destination of the run's results.
OUTPUT_FILE = "results.json"


def _example_json(title, url, description, start_date, end_date):
    """Serialize one few-shot example answer as a JSON string.

    The keys are emitted in the order title, url, description, startDate,
    endDate, which is the shape shown to the model as the expected output.
    """
    return json.dumps(
        {
            "title": title,
            "url": url,
            "description": description,
            "startDate": start_date,
            "endDate": end_date,
        }
    )


# Few-shot example 1: linked title, date range within a single month.
HTML_EXAMPLE_1 = r"""<p><a href="http://www.crm.sns.it/event/507/" target="_blank" rel="noreferrer noopener">Statistical and Computational Aspects of Dynamics<br></a>Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi &#8211; SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria). Centro De Giorgi &#8211; SNS, Pisa. December 13 &#8211; 16, 2022.</p>"""
OUTPUT_EXAMPLE_1 = _example_json(
    title="Statistical and Computational Aspects of Dynamics",
    url="http://www.crm.sns.it/event/507/",
    description="Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria).\n\nLocation: Centro De Giorgi SNS, Pisa",
    start_date="2022-12-13",
    end_date="2022-12-16",
)

# Few-shot example 2: linked title wrapped in <em>.
HTML_EXAMPLE_2 = r"""<p><em><a href="http://pagine.dm.unipi.it/berselli/meeting2020/" rel="noreferrer noopener" target="_blank">Workshop on Variational problems, PDEs and applications<br/></a></em>Organized by Luigi Berselli, Giuseppe Buttazzo, Matteo Novaga, and Andrea Malchiodi (Scuola Normale Superiore, Pisa). Department of Mathematics, Pisa. January 17 18, 2020.</p>"""
OUTPUT_EXAMPLE_2 = _example_json(
    title="Workshop on Variational problems, PDEs and applications",
    url="http://pagine.dm.unipi.it/berselli/meeting2020/",
    description="Organized by Luigi Berselli, Giuseppe Buttazzo, Matteo Novaga, and Andrea Malchiodi (Scuola Normale Superiore, Pisa).\n\nLocation: Department of Mathematics, Pisa",
    start_date="2020-01-17",
    end_date="2020-01-18",
)

# Few-shot example 3: no link (url is null) and a date range spanning two months.
HTML_EXAMPLE_3 = r"""<p>Geometric Representation Theory. ICM Satellite Conference<br/>Organized by Tomoyuki Arakawa (RIMS, Kyoto, Japan), Joel Kamnitzer (University of Toronto, Japan), Hiraku Nakajima (Kavli IPMU, Japan), Markus Reineke (Ruhr-Universität Bochum), Francesco Sala, and Vera Serganova (University of California Berkeley, USA). Online. June 27 July 2, 2022.</p>"""
OUTPUT_EXAMPLE_3 = _example_json(
    title="Geometric Representation Theory. ICM Satellite Conference",
    url=None,
    description="Organized by Tomoyuki Arakawa (RIMS, Kyoto, Japan), Joel Kamnitzer (University of Toronto, Japan), Hiraku Nakajima (Kavli IPMU, Japan), Markus Reineke (Ruhr-Universität Bochum), Francesco Sala, and Vera Serganova (University of California Berkeley, USA).\n\nLocation: Online",
    start_date="2022-06-27",
    end_date="2022-07-02",
)
def translate_to_json(conference_html: str) -> str:
    """Ask the LLM to turn one conference HTML snippet into a JSON string.

    The prompt is a system instruction, then the three worked examples
    (HTML_EXAMPLE_n followed by OUTPUT_EXAMPLE_n), then ``conference_html``.
    Every input is preceded by an "INPUT:" message and every answer slot by
    an "OUTPUT JSON:" message.

    Returns the text of the first completion choice as-is; nothing here
    checks that it is valid JSON.
    """

    def user(text):
        # NOTE(review): every message, including the example answers, is sent
        # with role "user". Kept as in the original prompt -- confirm whether
        # an "assistant" role was intended for the example outputs.
        return {"role": "user", "content": text}

    few_shot_pairs = (
        (HTML_EXAMPLE_1, OUTPUT_EXAMPLE_1),
        (HTML_EXAMPLE_2, OUTPUT_EXAMPLE_2),
        (HTML_EXAMPLE_3, OUTPUT_EXAMPLE_3),
    )

    prompt = [
        {
            "role": "system",
            "content": "You are an assistant helping a developer converting raw text data to JSON. Be precise in formatting the output and only output valid JSON using the specificied fields, without including any additional fields or comments",
        }
    ]
    for example_html, example_json in few_shot_pairs:
        prompt.extend(
            [
                user("INPUT:"),
                user(example_html),
                user("OUTPUT JSON:"),
                user(example_json),
            ]
        )
    # Actual item to process: same framing, with the answer left open.
    prompt.extend([user("INPUT:"), user(conference_html), user("OUTPUT JSON:")])

    completion = llm.create_chat_completion(max_tokens=None, messages=prompt)
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]
def crawl_page(url, timeout=30):
    """Download one listing page and return its conference HTML snippets.

    The snippets are the sibling elements with non-blank text that follow
    the first ``<h2 class="wp-block-heading">``. Collection stops at the
    first ``<div class="page-links">``, or at the end of the heading's
    siblings if that div is not among them.

    Args:
        url: Address of the page to download.
        timeout: Seconds ``requests.get`` may wait on the server. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        A list of HTML strings, one per element found.

    Raises:
        Exception: If the response status is not 200, or if the heading or
            the page-links div cannot be found in the page.
        requests.exceptions.RequestException: If the request itself fails,
            including when ``timeout`` expires.
    """
    print(f"Crawling {url}")
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception("Failed to fetch")

    soup = BeautifulSoup(response.text, "html.parser")
    # The conference entries sit between these two marker elements.
    heading = soup.find("h2", class_="wp-block-heading")
    page_links = soup.find("div", class_="page-links")
    if heading is None or page_links is None:
        raise Exception("Failed to find elements")

    snippets = []
    element = heading.find_next_sibling()
    while element and element != page_links:
        # Skip elements that carry no visible text.
        if element.name is not None and element.text.strip():
            snippets.append(str(element))
        element = element.find_next_sibling()
    print(f"Found {len(snippets)} conferences")
    return snippets
baseurl = "https://www.dm.unipi.it/research/past-conferences/"
# Crawl the base URL itself plus the URLs formed by appending "2" .. "7" to it.
page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 8)]
conference_html_snippets = [snippet for link in page_urls for snippet in crawl_page(link)]

# Load the model, set the chat format and use the default model context length.
# `llm` must stay a module-level name: translate_to_json() reads it as a global.
llm = Llama(
    model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    chat_format="llama-2",
    verbose=False,
    n_ctx=0,
)

# The result file is a sequence of JSON objects, one per line.
# The context manager closes the file even if a translation raises midway.
with open(OUTPUT_FILE, "w") as results_file:
    for conference_html in conference_html_snippets:
        print("--------------------------------------------------")
        print("Translating:")
        print(conference_html)
        conference_json = translate_to_json(conference_html)
        # "success" (and "json" when parsing works) are added below.
        result = {
            "input_html": conference_html,
            "raw_output": conference_json,
        }
        print("Result:")
        print(conference_json)
        try:
            # Parse the result string into a JSON object to check correctness.
            conference_object = json.loads(conference_json)
        except (json.JSONDecodeError, TypeError):
            # Only parse failures are treated as "invalid output"; Ctrl-C and
            # other unrelated errors are no longer swallowed.
            print("> json is invalid, skipping")
            result["success"] = False
        else:
            result["success"] = True
            result["json"] = conference_object
        # The record is written either way, so failed items keep their raw output.
        json.dump(result, results_file)
        results_file.write("\n")
        # Flush per record so partial results survive an interrupted run.
        results_file.flush()