#!/usr/bin/env python3
from llama_cpp import Llama
from bs4 import BeautifulSoup
import requests
import json
from datetime import datetime
# Minute-resolution timestamp taken once at start-up; it is embedded in the
# name of the results file written by this run.
current_date = format(datetime.now(), "%Y-%m-%d_%H-%M")
OUTPUT_FILE = "results_" + current_date + ".json"
# Few-shot examples shown to the model before the real query. Each entry pairs
# the raw text of one conference listing ("input") with the JSON string the
# model is expected to produce for it ("output").
#
# The outputs are serialized with ensure_ascii=False so that non-ASCII
# characters (en dashes, umlauts, typographic quotes) appear literally, exactly
# as they do in the inputs, instead of being rewritten as \uXXXX escapes that
# the model would then have to imitate.
#
# NOTE(review): the example inputs are plain text and contain no URL, yet the
# expected outputs carry a "url" value, while crawl_page() produces HTML
# snippets -- confirm whether the example inputs should be HTML as well.
EXAMPLES = [
    {
        "input": r"""
Statistical and Computational Aspects of Dynamics
Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi – SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria). Centro De Giorgi – SNS, Pisa. December 13 – 16, 2022.
""",
        "output": json.dumps(
            {
                "title": "Statistical and Computational Aspects of Dynamics",
                "url": "http://www.crm.sns.it/event/507/",
                "description": "Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi – SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria).",
                "location": "Centro De Giorgi – SNS, Pisa",
                "startDate": "2022-12-13",
                "endDate": "2022-12-16",
            },
            ensure_ascii=False,
        ),
    },
    {
        "input": r"""Workshop on Variational problems, PDEs and applications
Organized by Luigi Berselli, Giuseppe Buttazzo, Matteo Novaga, and Andrea Malchiodi (Scuola Normale Superiore, Pisa). Department of Mathematics, Pisa. January 17 – 18, 2020.
""",
        "output": json.dumps(
            {
                "title": "Workshop on Variational problems, PDEs and applications",
                "url": "http://pagine.dm.unipi.it/berselli/meeting2020/",
                "description": "Organized by Luigi Berselli, Giuseppe Buttazzo, Matteo Novaga, and Andrea Malchiodi (Scuola Normale Superiore, Pisa).",
                "location": "Department of Mathematics, Pisa",
                "startDate": "2020-01-17",
                "endDate": "2020-01-18",
            },
            ensure_ascii=False,
        ),
    },
    {
        "input": r"""Geometric Representation Theory. ICM Satellite Conference
Organized by Tomoyuki Arakawa (RIMS, Kyoto, Japan), Joel Kamnitzer (University of Toronto, Japan), Hiraku Nakajima (Kavli IPMU, Japan), Markus Reineke (Ruhr-Universität Bochum), Francesco Sala, and Vera Serganova (University of California Berkeley, USA). Online. June 27 – July 2, 2022.
""",
        "output": json.dumps(
            {
                "title": "Geometric Representation Theory. ICM Satellite Conference",
                # No event page for this one: serialized as JSON null.
                "url": None,
                "description": "Organized by Tomoyuki Arakawa (RIMS, Kyoto, Japan), Joel Kamnitzer (University of Toronto, Japan), Hiraku Nakajima (Kavli IPMU, Japan), Markus Reineke (Ruhr-Universität Bochum), Francesco Sala, and Vera Serganova (University of California Berkeley, USA).",
                "location": "Online",
                "startDate": "2022-06-27",
                "endDate": "2022-07-02",
            },
            ensure_ascii=False,
        ),
    },
    {
        "input": r"""Incontri di geometria algebrica ed aritmetica Milano – Pisa
Department of Mathematics, Pisa. November 16 – 17, 2022.
""",
        "output": json.dumps(
            {
                "title": "Incontri di geometria algebrica ed aritmetica Milano – Pisa",
                "url": "https://events.dm.unipi.it/event/109/",
                # Listing without an "Organized by" sentence: empty description.
                "description": "",
                "location": "Department of Mathematics, Pisa",
                "startDate": "2022-11-16",
                "endDate": "2022-11-17",
            },
            ensure_ascii=False,
        ),
    },
    {
        "input": r"""A journey in numerical linear algebra: a workshop in honor of Michele Benzi’s 60th birthday
Organized by Francesca Arrigo (University of Strathclyde), Paola Boito, Christine Klymko (Lawrence Livermore National Laboratory), Beatrice Meini, and Leonardo Robol. Department of Mathematics, Pisa. June 10 – 11, 2022.
""",
        "output": json.dumps(
            {
                "title": "A journey in numerical linear algebra: a workshop in honor of Michele Benzi’s 60th birthday",
                "url": "https://events.dm.unipi.it/event/75/",
                "description": "Organized by Francesca Arrigo (University of Strathclyde), Paola Boito, Christine Klymko (Lawrence Livermore National Laboratory), Beatrice Meini, and Leonardo Robol.",
                "location": "Department of Mathematics, Pisa",
                "startDate": "2022-06-10",
                "endDate": "2022-06-11",
            },
            ensure_ascii=False,
        ),
    },
]
def translate_to_json(input_html: str) -> str:
    """Ask the LLM to convert one scraped conference snippet into JSON text.

    The prompt consists of a system instruction, the few-shot pairs from
    ``EXAMPLES`` and finally the snippet to translate. Each example is sent as
    a ``user`` turn (the input) followed by an ``assistant`` turn (the expected
    JSON), because the chat template expects user and assistant messages to
    alternate; a run of consecutive ``user`` messages does not fit it.

    Args:
        input_html: Raw snippet describing a single conference.

    Returns:
        The text content of the model's first choice. It is NOT validated
        here; the caller is responsible for checking that it parses as JSON.
    """
    messages = [
        {
            "role": "system",
            "content": "You are an assistant helping a developer converting raw text data to JSON. Output only valid compact inline JSON following the given examples, without including any additional notes or comments",
        },
    ]
    for example in EXAMPLES:
        # Same wording as the real query below, so the model sees one
        # consistent request format and answers the same way.
        messages.append(
            {
                "role": "user",
                "content": f"INPUT:\n{example['input']}\n\nOUTPUT JSON:\n",
            }
        )
        messages.append({"role": "assistant", "content": example["output"]})
    messages.append(
        {
            "role": "user",
            "content": f"INPUT:\n{input_html}\n\nOUTPUT JSON:\n",
        }
    )

    # `llm` is the module-level model instance created by the main script.
    llm_answer = llm.create_chat_completion(
        max_tokens=None,
        messages=messages,
    )
    return llm_answer["choices"][0]["message"]["content"]
def crawl_page(url: str, timeout: float = 30) -> list:
    """Download one listing page and return the HTML of each conference entry.

    The entries are the sibling elements that follow the first
    ``<h2 class="wp-block-heading">`` up to (not including) the first
    ``<div class="page-links">``; elements without visible text are skipped.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive host cannot stall the whole crawl indefinitely.

    Returns:
        A list of HTML strings, one per conference entry, in page order.

    Raises:
        RuntimeError: If the response status is not 200, or if the heading or
            the page-links marker cannot be found in the page.
        requests.RequestException: On connection errors or timeouts.
    """
    print(f"Crawling {url}")
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        raise RuntimeError(f"Failed to fetch {url} (HTTP {r.status_code})")

    soup = BeautifulSoup(r.text, "html.parser")
    # Find the two elements that delimit the list of conferences.
    h2 = soup.find("h2", class_="wp-block-heading")
    div = soup.find("div", class_="page-links")
    if h2 is None or div is None:
        raise RuntimeError(f"Failed to find elements in {url}")

    # Collect every element between the heading and the page-links marker.
    result = []
    current = h2.find_next_sibling()
    # Identity test on purpose: comparing tags with == / != compares their
    # structure, whereas the loop must stop at this exact marker element.
    while current is not None and current is not div:
        if current.name is not None and current.text.strip():
            result.append(str(current))
        current = current.find_next_sibling()
    print(f"Found {len(result)} conferences")
    return result
# Pages to crawl: the landing page plus the numbered pages 2 through 7.
baseurl = "https://www.dm.unipi.it/research/past-conferences/"
page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 8)]
conference_html_snippets = [
    snippet for link in page_urls for snippet in crawl_page(link)
]

# Load the model, set the chat format, and use the model's default context
# length (n_ctx=0).
llm = Llama(
    model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    chat_format="llama-2",
    verbose=False,
    n_ctx=0,
)

# The result file is a sequence of JSON objects, one per line. The context
# manager closes the file even if a translation raises part-way through.
with open(OUTPUT_FILE, "w", encoding="utf-8") as results_file:
    for conference_html in conference_html_snippets:
        print("--------------------------------------------------")
        print("Translating:")
        print(conference_html)
        conference_json = translate_to_json(conference_html)
        result = {
            "input_html": conference_html,
            "raw_output": conference_json,
        }
        print("Result:")
        print(conference_json)
        try:
            # Parse the model output to check that it really is valid JSON.
            conference_object = json.loads(conference_json)
        except (json.JSONDecodeError, TypeError):
            # Malformed (or non-string) model output: the record is still
            # written, flagged as failed, so it can be inspected or retried.
            print("> json is invalid, skipping")
            result["success"] = False
        else:
            result["success"] = True
            result["json"] = conference_object
        json.dump(result, results_file)
        results_file.write("\n")
        # Flush after every record so partial results survive an interruption.
        results_file.flush()