You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

161 lines
6.5 KiB
Python

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/env python3
from llama_cpp import Llama
from bs4 import BeautifulSoup
import requests
import json
# Path of the file the translation results are written to (one JSON object per line).
OUTPUT_FILE = "results.json"
# Few-shot examples that translate_to_json() places in the prompt.
# Each entry pairs a raw HTML snippet ("input") with the JSON string the model
# is expected to produce for it ("output", already serialized with json.dumps).
# Every expected object carries the keys: title, url, description, startDate, endDate.
EXAMPLES = [
# Example 1: linked title, organizers and a location, date range within one month.
{
"input": r"""<p><a href="http://www.crm.sns.it/event/507/" target="_blank" rel="noreferrer noopener">Statistical and Computational Aspects of Dynamics<br></a>Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi &#8211; SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria). Centro De Giorgi &#8211; SNS, Pisa. December 13 &#8211; 16, 2022.</p>""",
"output": json.dumps({
"title": "Statistical and Computational Aspects of Dynamics",
"url": "http://www.crm.sns.it/event/507/",
"description": "Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria).\n\nLocation: Centro De Giorgi SNS, Pisa",
"startDate": "2022-12-13",
"endDate": "2022-12-16"
}),
},
# Example 2: the link is wrapped in an <em> element.
{
"input": r"""<p><em><a href="http://pagine.dm.unipi.it/berselli/meeting2020/" rel="noreferrer noopener" target="_blank">Workshop on Variational problems, PDEs and applications<br/></a></em>Organized by Luigi Berselli, Giuseppe Buttazzo, Matteo Novaga, and Andrea Malchiodi (Scuola Normale Superiore, Pisa). Department of Mathematics, Pisa. January 17 18, 2020.</p>""",
"output": json.dumps({
"title": "Workshop on Variational problems, PDEs and applications",
"url": "http://pagine.dm.unipi.it/berselli/meeting2020/",
"description": "Organized by Luigi Berselli, Giuseppe Buttazzo, Matteo Novaga, and Andrea Malchiodi (Scuola Normale Superiore, Pisa).\n\nLocation: Department of Mathematics, Pisa",
"startDate": "2020-01-17",
"endDate": "2020-01-18",
}),
},
# Example 3: no <a> element, so "url" is None (serialized as JSON null);
# the date range spans two months.
{
"input": r"""<p>Geometric Representation Theory. ICM Satellite Conference<br/>Organized by Tomoyuki Arakawa (RIMS, Kyoto, Japan), Joel Kamnitzer (University of Toronto, Japan), Hiraku Nakajima (Kavli IPMU, Japan), Markus Reineke (Ruhr-Universität Bochum), Francesco Sala, and Vera Serganova (University of California Berkeley, USA). Online. June 27 July 2, 2022.</p>""",
"output": json.dumps({
"title": "Geometric Representation Theory. ICM Satellite Conference",
"url": None,
"description": "Organized by Tomoyuki Arakawa (RIMS, Kyoto, Japan), Joel Kamnitzer (University of Toronto, Japan), Hiraku Nakajima (Kavli IPMU, Japan), Markus Reineke (Ruhr-Universität Bochum), Francesco Sala, and Vera Serganova (University of California Berkeley, USA).\n\nLocation: Online",
"startDate": "2022-06-27",
"endDate": "2022-07-02",
}),
},
# Example 4: no organizers listed; the description holds only the location.
{
"input": r"""<p><a href="https://events.dm.unipi.it/event/109/" rel="noreferrer noopener" target="_blank">Incontri di geometria algebrica ed aritmetica Milano Pisa<br/></a>Department of Mathematics, Pisa. November 16 17, 2022.</p>""",
"output": json.dumps({
"title": "Incontri di geometria algebrica ed aritmetica Milano Pisa",
"url": "https://events.dm.unipi.it/event/109/",
"description": "Location: Department of Mathematics, Pisa",
"startDate": "2022-11-16",
"endDate": "2022-11-17"
})
}
]
def translate_to_json(input_html: str) -> str:
    """Ask the language model to convert one HTML snippet into a JSON string.

    The prompt consists of a system instruction, one user message per entry
    of the module-level ``EXAMPLES`` list (each showing an input together
    with its expected JSON output), and a final user message containing
    ``input_html`` with the output left for the model to fill in.

    Relies on the module-level ``llm`` object, which must have been created
    before this function is called.

    Args:
        input_html: Raw HTML snippet describing a single conference.

    Returns:
        The text content of the first choice returned by the model. It is
        NOT validated here; the caller is responsible for parsing it.
    """
    system_message = {
        "role": "system",
        "content": "You are an assistant helping a developer converting raw text data to JSON. Output only valid JSON following the given examples, without including any additional notes or comments",
    }

    # Build a real list: a lazy map() object cannot be concatenated to a
    # list with "+" (TypeError). Single quotes inside the f-string keep the
    # code valid on Python versions older than 3.12 as well.
    example_messages = [
        {
            "role": "user",
            "content": f"INPUT:\n{example['input']}\n\nOUTPUT JSON:\n{example['output']}",
        }
        for example in EXAMPLES
    ]

    query_message = {
        "role": "user",
        "content": f"INPUT:\n{input_html}\n\nOUTPUT JSON:\n",
    }

    llm_answer = llm.create_chat_completion(
        max_tokens=None,
        messages=[system_message, *example_messages, query_message],
    )
    return llm_answer["choices"][0]["message"]["content"]
def crawl_page(url, timeout=30):
    """Download a listing page and return the HTML of each conference entry.

    The entries are the sibling elements located between the first
    ``<h2 class="wp-block-heading">`` and the first
    ``<div class="page-links">`` of the page. Siblings whose text is empty
    or whitespace-only are skipped.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        A list of strings, each the serialized HTML of one element.

    Raises:
        Exception: If the response status is not 200, or if either of the
            two delimiting elements cannot be found.
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    print(f"Crawling {url}")
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        raise Exception("Failed to fetch")

    soup = BeautifulSoup(r.text, "html.parser")

    # The conference entries sit between these two delimiting elements.
    h2 = soup.find("h2", class_="wp-block-heading")
    div = soup.find("div", class_="page-links")
    if not (h2 and div):
        raise Exception("Failed to find elements")

    # Walk the siblings after the heading until the page-links div is reached.
    result = []
    current = h2.find_next_sibling()
    while current and current != div:
        # Keep only named elements that actually contain some text.
        if current.name is not None and current.text.strip():
            result.append(str(current))
        current = current.find_next_sibling()

    print(f"Found {len(result)} conferences")
    return result
# Listing pages to crawl: the base URL itself, followed by the numbered
# pages 2 through 7 (the page number is appended to the base URL).
baseurl = "https://www.dm.unipi.it/research/past-conferences/"
page_urls = [baseurl]
page_urls += [f"{baseurl}{page_number}" for page_number in range(2, 8)]

# Gather the snippets of every page into a single flat list, in crawl order.
conference_html_snippets = []
for page_url in page_urls:
    conference_html_snippets.extend(crawl_page(page_url))

# Load the model, select the chat format, and (per the original note)
# use the model's default context length via n_ctx=0.
llm = Llama(
    model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    n_ctx=0,
    chat_format="llama-2",
    echo=True,
    verbose=False,
)
# The result file is a sequence of JSON objects, one per line.
# The context manager guarantees the file is closed even if a translation
# raises part-way through the loop.
with open(OUTPUT_FILE, "w", encoding="utf-8") as results_file:
    for conference_html in conference_html_snippets:
        print("--------------------------------------------------")
        print("Translating:")
        print(conference_html)

        conference_json = translate_to_json(conference_html)
        result = {
            "input_html": conference_html,
            "raw_output": conference_json,
        }

        print("Result:")
        print(conference_json)

        try:
            # Parse the model output to check that it is valid JSON.
            conference_object = json.loads(conference_json)
        except (json.JSONDecodeError, TypeError):
            # JSONDecodeError: malformed JSON text; TypeError: the model
            # returned something that is not a string. Anything else
            # (e.g. KeyboardInterrupt) is deliberately not swallowed.
            print("> json is invalid, skipping")
            result["success"] = False
        else:
            result["success"] = True
            result["json"] = conference_object

        # Write and flush one record per snippet so partial progress
        # survives an interrupted run.
        json.dump(result, results_file)
        results_file.write("\n")
        results_file.flush()