#!/usr/bin/env python3
"""Scrape past-conference listings and convert each one to JSON with a local LLM.

The script crawls the paginated "past conferences" pages, extracts one HTML
snippet per conference, asks a llama.cpp chat model to translate every snippet
into a JSON object (guided by the few-shot ``EXAMPLES``), and writes one
record per line to ``OUTPUT_FILE``.
"""
import json
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from llama_cpp import Llama

current_date = datetime.now().strftime("%Y-%m-%d_%H-%M")
OUTPUT_FILE = f"results_{current_date}.json"

# Seconds to wait for the web server before giving up on a page; without a
# timeout, requests would block forever on a stalled connection.
REQUEST_TIMEOUT = 30

# Few-shot examples: each "input" is a raw conference snippet and each
# "output" is the JSON string the model is expected to produce for it.
EXAMPLES = [
    {
        "input": r"""

Statistical and Computational Aspects of Dynamics
Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi – SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria). Centro De Giorgi – SNS, Pisa. December 13 – 16, 2022.

""",
        "output": json.dumps({
            "title": "Statistical and Computational Aspects of Dynamics",
            "url": "http://www.crm.sns.it/event/507/",
            "description": "Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi – SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria).\n\nLocation: Centro De Giorgi – SNS, Pisa",
            "startDate": "2022-12-13",
            "endDate": "2022-12-16",
        }),
    },
    {
        "input": r"""

Workshop on Variational problems, PDEs and applications
Organized by Luigi Berselli, Giuseppe Buttazzo, Matteo Novaga, and Andrea Malchiodi (Scuola Normale Superiore, Pisa). Department of Mathematics, Pisa. January 17 – 18, 2020.

""",
        "output": json.dumps({
            "title": "Workshop on Variational problems, PDEs and applications",
            "url": "http://pagine.dm.unipi.it/berselli/meeting2020/",
            "description": "Organized by Luigi Berselli, Giuseppe Buttazzo, Matteo Novaga, and Andrea Malchiodi (Scuola Normale Superiore, Pisa).\n\nLocation: Department of Mathematics, Pisa",
            "startDate": "2020-01-17",
            "endDate": "2020-01-18",
        }),
    },
    {
        "input": r"""

Geometric Representation Theory. ICM Satellite Conference
Organized by Tomoyuki Arakawa (RIMS, Kyoto, Japan), Joel Kamnitzer (University of Toronto, Japan), Hiraku Nakajima (Kavli IPMU, Japan), Markus Reineke (Ruhr-Universität Bochum), Francesco Sala, and Vera Serganova (University of California Berkeley, USA). Online. June 27 – July 2, 2022.

""",
        "output": json.dumps({
            "title": "Geometric Representation Theory. ICM Satellite Conference",
            "url": None,
            "description": "Organized by Tomoyuki Arakawa (RIMS, Kyoto, Japan), Joel Kamnitzer (University of Toronto, Japan), Hiraku Nakajima (Kavli IPMU, Japan), Markus Reineke (Ruhr-Universität Bochum), Francesco Sala, and Vera Serganova (University of California Berkeley, USA).\n\nLocation: Online",
            "startDate": "2022-06-27",
            "endDate": "2022-07-02",
        }),
    },
    {
        "input": r"""

Incontri di geometria algebrica ed aritmetica Milano – Pisa
Department of Mathematics, Pisa. November 16 – 17, 2022.

""",
        "output": json.dumps({
            "title": "Incontri di geometria algebrica ed aritmetica Milano – Pisa",
            "url": "https://events.dm.unipi.it/event/109/",
            "description": "Location: Department of Mathematics, Pisa",
            "startDate": "2022-11-16",
            "endDate": "2022-11-17",
        }),
    },
]


def translate_to_json(input_html: str) -> str:
    """Ask the chat model to convert one conference snippet into JSON text.

    The prompt is a system instruction, one user message per entry of
    ``EXAMPLES`` (input followed by its expected output), and a final user
    message holding ``input_html`` with the output left open.

    Uses the module-level ``llm`` object, which must exist before the first
    call.

    Args:
        input_html: Raw snippet describing a single conference.

    Returns:
        The message content of the first choice in the model's reply. It is
        returned verbatim and is not guaranteed to be valid JSON.
    """
    system_message = {
        "role": "system",
        "content": "You are an assistant helping a developer converting raw text data to JSON. Output only valid JSON following the given examples, without including any additional notes or comments",
    }
    # NOTE(review): every few-shot example is sent as a single "user" message
    # that contains both the input and the expected output.
    few_shot_messages = [
        {
            "role": "user",
            "content": f"""INPUT:\n{example["input"]}\n\nOUTPUT JSON:\n{example["output"]}""",
        }
        for example in EXAMPLES
    ]
    query_message = {
        "role": "user",
        "content": f"INPUT:\n{input_html}\n\nOUTPUT JSON:\n",
    }

    llm_answer = llm.create_chat_completion(
        max_tokens=None,
        messages=[system_message] + few_shot_messages + [query_message],
    )
    return llm_answer["choices"][0]["message"]["content"]


def crawl_page(url: str) -> list:
    """Download one listing page and return its conference snippets.

    The snippets are the non-empty sibling elements located between the first
    ``<h2 class="wp-block-heading">`` and the first ``<div class="page-links">``
    of the page, each serialized back to an HTML string.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of HTML strings, one per element found.

    Raises:
        Exception: If the response status is not 200 or if either delimiter
            element is missing from the page.
        requests.exceptions.RequestException: If the request itself fails,
            including when it exceeds ``REQUEST_TIMEOUT``.
    """
    print(f"Crawling {url}")
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        raise Exception("Failed to fetch")

    soup = BeautifulSoup(response.text, "html.parser")

    # The conference entries sit between these two elements.
    heading = soup.find("h2", class_="wp-block-heading")
    page_links = soup.find("div", class_="page-links")
    if heading is None or page_links is None:
        raise Exception("Failed to find elements")

    # Walk the siblings after the heading until the pagination block.
    snippets = []
    current = heading.find_next_sibling()
    while current and current != page_links:
        # Skip elements that carry no visible text.
        if current.name is not None and current.text.strip():
            snippets.append(str(current))
        current = current.find_next_sibling()

    print(f"Found {len(snippets)} conferences")
    return snippets


baseurl = "https://www.dm.unipi.it/research/past-conferences/"
# The first page lives at the base URL; the following ones at <base>2 .. <base>7.
page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 8)]

conference_html_snippets = [
    snippet for link in page_urls for snippet in crawl_page(link)
]

# Load the model, set the chat format, and use the model's default context
# length (n_ctx=0).
llm = Llama(
    model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf",
    chat_format="llama-2",
    verbose=False,
    n_ctx=0,
)

# The result file is a sequence of JSON objects, one per line.
with open(OUTPUT_FILE, "w", encoding="utf-8") as results_file:
    for conference_html in conference_html_snippets:
        print("--------------------------------------------------")
        print("Translating:")
        print(conference_html)

        conference_json = translate_to_json(conference_html)

        result = {
            "input_html": conference_html,
            "raw_output": conference_json,
        }

        print("Result:")
        print(conference_json)

        try:
            # Parse the model output to check that it really is valid JSON.
            conference_object = json.loads(conference_json)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers a
            # non-string reply. The record is still written, flagged as failed.
            print("> json is invalid, skipping")
            result["success"] = False
        else:
            result["success"] = True
            result["json"] = conference_object

        json.dump(result, results_file)
        results_file.write("\n")
        # Flush after every record so finished results reach the file while
        # the remaining snippets are still being processed.
        results_file.flush()