#!/usr/bin/env python3
"""Crawl the past-conferences pages and convert each entry to JSON with an LLM.

The script runs three steps at module level, in this order:

1. download the listing pages and collect one HTML snippet per conference;
2. load a local GGUF model through ``llama_cpp``;
3. ask the model, with a few-shot prompt built from ``EXAMPLES``, to turn each
   snippet into a JSON object, and append one result record per line to
   ``OUTPUT_FILE``.
"""
from llama_cpp import Llama
from bs4 import BeautifulSoup
import requests
import json
from datetime import datetime

# The output file name carries the start time of the run.
current_date = datetime.now().strftime("%Y-%m-%d_%H-%M")

OUTPUT_FILE = f"results_{current_date}.json"

# Few-shot examples: raw input text paired with the JSON string the model is
# expected to produce for it.
# NOTE(review): the expected outputs contain URLs that do not appear in the
# example inputs; presumably the inputs were HTML snippets whose markup was
# lost at some point -- confirm against the original examples.
EXAMPLES = [
    {
        "input": r"""

Statistical and Computational Aspects of Dynamics
Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi – SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria). Centro De Giorgi – SNS, Pisa. December 13 – 16, 2022.

""",
        "output": json.dumps({
            "title": "Statistical and Computational Aspects of Dynamics",
            "url": "http://www.crm.sns.it/event/507/",
            "description": "Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi – SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria).",
            "location": "Centro De Giorgi – SNS, Pisa",
            "startDate": "2022-12-13",
            "endDate": "2022-12-16",
        }),
    },
    {
        "input": r"""

Workshop on Variational problems, PDEs and applications
Organized by Luigi Berselli, Giuseppe Buttazzo, Matteo Novaga, and Andrea Malchiodi (Scuola Normale Superiore, Pisa). Department of Mathematics, Pisa. January 17 – 18, 2020.

""",
        "output": json.dumps({
            "title": "Workshop on Variational problems, PDEs and applications",
            "url": "http://pagine.dm.unipi.it/berselli/meeting2020/",
            "description": "Organized by Luigi Berselli, Giuseppe Buttazzo, Matteo Novaga, and Andrea Malchiodi (Scuola Normale Superiore, Pisa).",
            "location": "Department of Mathematics, Pisa",
            "startDate": "2020-01-17",
            "endDate": "2020-01-18",
        }),
    },
    {
        "input": r"""

Geometric Representation Theory. ICM Satellite Conference
Organized by Tomoyuki Arakawa (RIMS, Kyoto, Japan), Joel Kamnitzer (University of Toronto, Japan), Hiraku Nakajima (Kavli IPMU, Japan), Markus Reineke (Ruhr-Universität Bochum), Francesco Sala, and Vera Serganova (University of California Berkeley, USA). Online. June 27 – July 2, 2022.

""",
        "output": json.dumps({
            "title": "Geometric Representation Theory. ICM Satellite Conference",
            "url": None,
            "description": "Organized by Tomoyuki Arakawa (RIMS, Kyoto, Japan), Joel Kamnitzer (University of Toronto, Japan), Hiraku Nakajima (Kavli IPMU, Japan), Markus Reineke (Ruhr-Universität Bochum), Francesco Sala, and Vera Serganova (University of California Berkeley, USA).",
            "location": "Online",
            "startDate": "2022-06-27",
            "endDate": "2022-07-02",
        }),
    },
    {
        "input": r"""

Incontri di geometria algebrica ed aritmetica Milano – Pisa
Department of Mathematics, Pisa. November 16 – 17, 2022.

""",
        "output": json.dumps({
            "title": "Incontri di geometria algebrica ed aritmetica Milano – Pisa",
            "url": "https://events.dm.unipi.it/event/109/",
            "description": "",
            "location": "Department of Mathematics, Pisa",
            "startDate": "2022-11-16",
            "endDate": "2022-11-17",
        }),
    },
    {
        "input": r"""

A journey in numerical linear algebra: a workshop in honor of Michele Benzi’s 60th birthday
Organized by Francesca Arrigo (University of Strathclyde), Paola Boito, Christine Klymko (Lawrence Livermore National Laboratory), Beatrice Meini, and Leonardo Robol. Department of Mathematics, Pisa. June 10 – 11, 2022.

""",
        "output": json.dumps({
            "title": "A journey in numerical linear algebra: a workshop in honor of Michele Benzi’s 60th birthday",
            "url": "https://events.dm.unipi.it/event/75/",
            "description": "Organized by Francesca Arrigo (University of Strathclyde), Paola Boito, Christine Klymko (Lawrence Livermore National Laboratory), Beatrice Meini, and Leonardo Robol.",
            "location": "Department of Mathematics, Pisa",
            "startDate": "2022-06-10",
            "endDate": "2022-06-11",
        }),
    },
]


def translate_to_json(input_html: str) -> str:
    """Ask the model to convert one conference snippet to a JSON string.

    The prompt is a system message, one user message per entry of
    ``EXAMPLES`` (input and expected output together), and a final user
    message holding ``input_html`` with an open "OUTPUT JSON:" slot.

    Uses the module-level ``llm``, which must be assigned before the first
    call.

    Args:
        input_html: The snippet to convert.

    Returns:
        The content of the first choice's message, unvalidated; the caller
        is responsible for checking that it is valid JSON.
    """
    # NOTE(review): every example is sent as a single "user" message rather
    # than a user/assistant pair; kept as-is so the prompt stays unchanged.
    few_shot_messages = [
        {
            "role": "user",
            "content": f"""INPUT:\n{example["input"]}\n\nOUTPUT JSON:\n{example["output"]}""",
        }
        for example in EXAMPLES
    ]
    llm_answer = llm.create_chat_completion(
        max_tokens=None,
        messages=[
            {
                "role": "system",
                "content": "You are an assistant helping a developer converting raw text data to JSON. Output only valid compact inline JSON following the given examples, without including any additional notes or comments",
            },
            *few_shot_messages,
            {
                "role": "user",
                "content": f"INPUT:\n{input_html}\n\nOUTPUT JSON:\n",
            },
        ],
    )
    return llm_answer["choices"][0]["message"]["content"]


def crawl_page(url, timeout=30):
    """Fetch a listing page and return the HTML of its conference entries.

    The entries are the tag siblings that follow the first
    ``<h2 class="wp-block-heading">`` up to (not including) the first
    ``<div class="page-links">``. Siblings without any text are skipped.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        A list of HTML strings, one per entry found.

    Raises:
        Exception: If the response status is not 200, or if either marker
            element is missing from the page.
    """
    print(f"Crawling {url}")

    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        raise Exception("Failed to fetch")

    soup = BeautifulSoup(r.text, "html.parser")

    # Find the two elements that delimit the list of conferences
    h2 = soup.find("h2", class_="wp-block-heading")
    div = soup.find("div", class_="page-links")
    if h2 is None or div is None:
        raise Exception("Failed to find elements")

    # Extract all the elements between the heading and the page-links div.
    # The stop test uses identity: BeautifulSoup's == compares markup, so an
    # earlier sibling with the same markup as the div would end the scan.
    result = []
    current = h2.find_next_sibling()
    while current is not None and current is not div:
        if current.name is not None and current.text.strip():
            result.append(str(current))
        current = current.find_next_sibling()

    print(f"Found {len(result)} conferences")
    return result


baseurl = "https://www.dm.unipi.it/research/past-conferences/"
# The first page is the base URL itself; pages 2 to 7 append the page number.
page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 8)]

conference_html_snippets = [
    snippet
    for link in page_urls
    for snippet in crawl_page(link)
]

# Load the model, set the chat format and use the default model context length
llm = Llama(
    model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    chat_format="llama-2",
    verbose=False,
    n_ctx=0,
)

# the result file is a sequence of json objects, one per line
with open(OUTPUT_FILE, "w", encoding="utf-8") as results_file:
    for conference_html in conference_html_snippets:
        print("--------------------------------------------------")
        print("Translating:")
        print(conference_html)

        conference_json = translate_to_json(conference_html)

        result = {
            "input_html": conference_html,
            "raw_output": conference_json,
        }

        print("Result:")
        print(conference_json)

        try:
            # parse the result string into a json object to check correctness
            conference_object = json.loads(conference_json)
        except (ValueError, TypeError):
            # ValueError covers malformed JSON (json.JSONDecodeError);
            # TypeError covers a non-string answer such as None.
            print("> json is invalid, skipping")
            result["success"] = False
        else:
            result["success"] = True
            result["json"] = conference_object

        json.dump(result, results_file)
        results_file.write("\n")
        # Flush after every record so finished results reach the file
        # even if a later iteration fails.
        results_file.flush()