#!/usr/bin/env python3 from llama_cpp import Llama from bs4 import BeautifulSoup import requests import json OUTPUT_FILE = "conferences.json" LLM_EXAMPLE = ( "INPUT:\n" '

Statistical' " and Computational Aspects of Dynamics
Organized by Buddhima Kasun Fernando Akurugodage" " (Centro di ricerca matematica Ennio De Giorgi – SNS), Paolo Giulietti, and Tanja" " Isabelle Schindler (Universität Wien, Austria). Centro De Giorgi – SNS, Pisa. December 13" " – 16, 2022.

\n" "\n" "OUTPUT (JSON): \n" "{" '"title": "Statistical and Computational Aspects of Dynamics",' '"url": "http://www.crm.sns.it/event/507/", ' '"description": "Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica' " Ennio De Giorgi – SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien," ' Austria). Centro De Giorgi - SNS, Pisa.", ' '"startDate": "2022-12-13", ' '"endDate": "2022-12-16"' "}\n" "\n" "INPUT:\n" ) def translate_to_json(conference_html: str) -> str: llm_answer = llm.create_chat_completion( max_tokens=None, messages=[ { "role": "system", "content": "You are an assistant aiding a software developer. Be precise in formatting the output correctly as requested", }, {"role": "user", "content": LLM_EXAMPLE}, {"role": "user", "content": conference_html}, {"role": "user", "content": "OUTPUT (JSON):"}, ], ) return llm_answer["choices"][0]["message"]["content"] def crawl_page(url): print(f"Crawling {url}") r = requests.get(url) if r.status_code == 200: html = r.text soup = BeautifulSoup(html, "html.parser") # Find the two elements h2 = soup.find("h2", class_="wp-block-heading") div = soup.find("div", class_="page-links") # Extract all the elements between h2_tag and div_tag if h2 and div: result = [] current = h2.find_next_sibling() while current and current != div: if current.name is not None and current.text.strip(): result.append(str(current)) current = current.find_next_sibling() print(f"Found {len(result)} conferences") return result else: raise Exception("Failed to find elements") else: raise Exception("Failed to fetch") baseurl = "https://www.dm.unipi.it/research/past-conferences/" page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 7)] conference_html_snippets = [snippet for link in page_urls for snippet in crawl_page(link)] print("LLM Example Context:") print(LLM_EXAMPLE) # Load the model and, set the chat format and use the default model context length llm = Llama(model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf", chat_format="llama-2", n_ctx=0) # clear the result file open(OUTPUT_FILE, "w").close() # the result file is a sequence of json objects, one per line results_file = open(OUTPUT_FILE, "a") for conference_html in conference_html_snippets: print("Translating:") print(conference_html) conference_json = translate_to_json(conference_html) print("Result:") print(conference_json) try: # parse the result string into a json object to check correctness conference_object = json.loads(conference_json) json.dump(conference_object, results_file) results_file.write("\n") results_file.flush() except: print("> json is invalid, skipping") print(conference_json) results_file.close()