|
|
#!/usr/bin/env python3
|
|
|
|
|
|
from llama_cpp import Llama
|
|
|
from bs4 import BeautifulSoup
|
|
|
import requests
|
|
|
import json
|
|
|
from datetime import datetime
|
|
|
|
|
|
|
|
|
# Timestamp of this run (minute resolution), used to give every run its own
# output file.
current_date = format(datetime.now(), "%Y-%m-%d_%H-%M")

# Destination of the results written by this script.
OUTPUT_FILE = "results_" + current_date + ".json"
|
|
|
|
|
|
|
|
|
def _few_shot(html, title, url, description, start, end):
    """Pair one raw HTML snippet with the JSON text expected for it.

    The keys are always emitted in the order title, url, description,
    startDate, endDate so that every example has the same layout.
    """
    expected = {
        "title": title,
        "url": url,
        "description": description,
        "startDate": start,
        "endDate": end,
    }
    return {"input": html, "output": json.dumps(expected)}


# Few-shot examples: each entry holds an "input" HTML fragment and the
# "output" JSON string that should be produced for it.
EXAMPLES = [
    # Linked title, organizers, location and a date range within one month.
    _few_shot(
        html=r"""<p><a href="http://www.crm.sns.it/event/507/" target="_blank" rel="noreferrer noopener">Statistical and Computational Aspects of Dynamics<br></a>Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi – SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria). Centro De Giorgi – SNS, Pisa. December 13 – 16, 2022.</p>""",
        title="Statistical and Computational Aspects of Dynamics",
        url="http://www.crm.sns.it/event/507/",
        description="Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi – SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria).\n\nLocation: Centro De Giorgi – SNS, Pisa",
        start="2022-12-13",
        end="2022-12-16",
    ),
    # Same shape, but the link is wrapped in an <em> element.
    _few_shot(
        html=r"""<p><em><a href="http://pagine.dm.unipi.it/berselli/meeting2020/" rel="noreferrer noopener" target="_blank">Workshop on Variational problems, PDEs and applications<br/></a></em>Organized by Luigi Berselli, Giuseppe Buttazzo, Matteo Novaga, and Andrea Malchiodi (Scuola Normale Superiore, Pisa). Department of Mathematics, Pisa. January 17 – 18, 2020.</p>""",
        title="Workshop on Variational problems, PDEs and applications",
        url="http://pagine.dm.unipi.it/berselli/meeting2020/",
        description="Organized by Luigi Berselli, Giuseppe Buttazzo, Matteo Novaga, and Andrea Malchiodi (Scuola Normale Superiore, Pisa).\n\nLocation: Department of Mathematics, Pisa",
        start="2020-01-17",
        end="2020-01-18",
    ),
    # No link at all (url is null) and a date range spanning two months.
    _few_shot(
        html=r"""<p>Geometric Representation Theory. ICM Satellite Conference<br/>Organized by Tomoyuki Arakawa (RIMS, Kyoto, Japan), Joel Kamnitzer (University of Toronto, Japan), Hiraku Nakajima (Kavli IPMU, Japan), Markus Reineke (Ruhr-Universität Bochum), Francesco Sala, and Vera Serganova (University of California Berkeley, USA). Online. June 27 – July 2, 2022.</p>""",
        title="Geometric Representation Theory. ICM Satellite Conference",
        url=None,
        description="Organized by Tomoyuki Arakawa (RIMS, Kyoto, Japan), Joel Kamnitzer (University of Toronto, Japan), Hiraku Nakajima (Kavli IPMU, Japan), Markus Reineke (Ruhr-Universität Bochum), Francesco Sala, and Vera Serganova (University of California Berkeley, USA).\n\nLocation: Online",
        start="2022-06-27",
        end="2022-07-02",
    ),
    # No organizers: the description carries only the location.
    _few_shot(
        html=r"""<p><a href="https://events.dm.unipi.it/event/109/" rel="noreferrer noopener" target="_blank">Incontri di geometria algebrica ed aritmetica Milano – Pisa<br/></a>Department of Mathematics, Pisa. November 16 – 17, 2022.</p>""",
        title="Incontri di geometria algebrica ed aritmetica Milano – Pisa",
        url="https://events.dm.unipi.it/event/109/",
        description="Location: Department of Mathematics, Pisa",
        start="2022-11-16",
        end="2022-11-17",
    ),
]
|
|
|
|
|
|
def translate_to_json(input_html: str) -> str:
    """Ask the chat model to turn one HTML snippet into JSON text.

    The conversation sent to the module-level ``llm`` consists of a system
    instruction, one user message per entry of ``EXAMPLES`` (each showing an
    input together with its expected output), and a final user message that
    holds ``input_html`` with the output part left empty.

    Args:
        input_html: HTML fragment to convert.

    Returns:
        The message content of the first choice in the model's answer. The
        text is returned as-is; it is not checked for JSON validity here.
    """

    def user_turn(source, expected=""):
        # Examples and the real query share one template; the query simply
        # leaves the expected output blank.
        return {
            "role": "user",
            "content": f"INPUT:\n{source}\n\nOUTPUT JSON:\n{expected}",
        }

    conversation = [
        {
            "role": "system",
            "content": "You are an assistant helping a developer converting raw text data to JSON. Output only valid JSON following the given examples, without including any additional notes or comments",
        },
    ]
    for shot in EXAMPLES:
        conversation.append(user_turn(shot["input"], shot["output"]))
    conversation.append(user_turn(input_html))

    llm_answer = llm.create_chat_completion(
        messages=conversation,
        max_tokens=None,
    )
    first_choice = llm_answer["choices"][0]
    return first_choice["message"]["content"]
|
|
|
|
|
|
|
|
|
def crawl_page(url, *, timeout=30):
    """Download one listing page and return its conference HTML snippets.

    The snippets are the elements with non-blank text that follow the first
    ``<h2 class="wp-block-heading">`` as siblings, stopping at the first
    ``<div class="page-links">``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without it a stalled connection would block the script forever.

    Returns:
        A list of strings, each the serialized HTML of one element.

    Raises:
        RuntimeError: If the response status is not 200, or if either of the
            two marker elements is missing from the page.
    """
    print(f"Crawling {url}")
    r = requests.get(url, timeout=timeout)

    if r.status_code != 200:
        raise RuntimeError(f"Failed to fetch {url} (HTTP status {r.status_code})")

    soup = BeautifulSoup(r.text, "html.parser")

    # The conference entries sit between these two marker elements.
    h2 = soup.find("h2", class_="wp-block-heading")
    div = soup.find("div", class_="page-links")
    if not (h2 and div):
        raise RuntimeError(f"Failed to find elements in {url}")

    result = []
    current = h2.find_next_sibling()
    while current and current != div:
        # Keep only real elements that carry some visible text.
        if current.name is not None and current.text.strip():
            result.append(str(current))
        current = current.find_next_sibling()

    print(f"Found {len(result)} conferences")
    return result
|
|
|
|
|
|
|
|
|
# First page of the past-conferences listing; the following pages are
# addressed by appending their page number (2 through 7) to this URL.
baseurl = "https://www.dm.unipi.it/research/past-conferences/"
page_urls = [baseurl, *(f"{baseurl}{page_number}" for page_number in range(2, 8))]

# Crawl the pages in order and collect every snippet into one flat list.
conference_html_snippets = []
for link in page_urls:
    conference_html_snippets.extend(crawl_page(link))
|
|
|
|
|
|
# Load the model, set the chat format, and use the model's default context
# length.
# NOTE(review): `echo` is usually an option of completion calls rather than of
# the constructor; confirm that it has any effect here.
llm = Llama(
    model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    n_ctx=0,
    chat_format="llama-2",
    echo=True,
    verbose=False,
)
|
|
|
|
|
|
# The results file is a sequence of JSON objects, one per line. The context
# manager guarantees the file is closed even if a translation raises.
with open(OUTPUT_FILE, "w", encoding="utf-8") as results_file:
    for conference_html in conference_html_snippets:
        print("--------------------------------------------------")
        print("Translating:")
        print(conference_html)

        conference_json = translate_to_json(conference_html)

        result = {
            "input_html": conference_html,
            "raw_output": conference_json,
        }

        print("Result:")
        print(conference_json)

        try:
            # Parse the model output to check that it really is valid JSON.
            conference_object = json.loads(conference_json)
        except (ValueError, TypeError):
            # json.JSONDecodeError is a ValueError; TypeError covers a
            # non-string answer. Anything else (e.g. Ctrl-C) must propagate.
            print("> json is invalid, skipping")
            result["success"] = False
        else:
            result["success"] = True
            result["json"] = conference_object

        # The record is written whether or not parsing succeeded, and flushed
        # immediately so finished records survive an interrupted run.
        json.dump(result, results_file)
        results_file.write("\n")
        results_file.flush()
|