conferences-llm-crawler/main.py

#!/usr/bin/env python3

from llama_cpp import Llama
from bs4 import BeautifulSoup
import requests
import json

# Path of the results file. The script writes one JSON object per line to it.
OUTPUT_FILE = "conferences.json"

# One-shot example given to the model before every real snippet: a sample
# conference HTML paragraph ("INPUT:") followed by the JSON object expected
# for it ("OUTPUT (JSON):"). The text deliberately ends with a dangling
# "INPUT:" header so that the HTML snippet sent in the next chat message
# reads as the second input of the same pattern.
# The expected keys shown to the model are: title, url, description,
# startDate and endDate (dates written as YYYY-MM-DD).
LLM_EXAMPLE = (
    "INPUT:\n"
    '<p><a href="http://www.crm.sns.it/event/507/" target="_blank" rel="noreferrer noopener">Statistical'
    " and Computational Aspects of Dynamics<br></a>Organized by Buddhima Kasun Fernando Akurugodage"
    " (Centro di ricerca matematica Ennio De Giorgi &#8211; SNS), Paolo Giulietti, and Tanja"
    " Isabelle Schindler (Universität Wien, Austria). Centro De Giorgi &#8211; SNS, Pisa. December 13"
    " &#8211; 16, 2022.</p>\n"
    "\n"
    "OUTPUT (JSON): \n"
    "{"
    '"title": "Statistical and Computational Aspects of Dynamics",'
    '"url": "http://www.crm.sns.it/event/507/", '
    # NOTE(review): the two "&#8211;" entities of the input are rendered
    # differently below (an en dash, then a plain hyphen) — confirm whether
    # that inconsistency in the example is intended.
    '"description": "Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica'
    " Ennio De Giorgi – SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien,"
    ' Austria). Centro De Giorgi - SNS, Pisa.", '
    '"startDate": "2022-12-13", '
    '"endDate": "2022-12-16"'
    "}\n"
    "\n"
    "INPUT:\n"
)


def translate_to_json(conference_html: str) -> str:
    """Ask the chat model to convert one conference HTML snippet to JSON text.

    The conversation consists of a system instruction followed by three user
    messages: the worked example (``LLM_EXAMPLE``), the snippet itself, and an
    ``OUTPUT (JSON):`` cue. The module-level ``llm`` object must already be
    initialised when this function is called.

    Args:
        conference_html: HTML markup describing a single conference.

    Returns:
        The raw text of the model's first answer. It is not validated here;
        the caller is responsible for checking that it parses as JSON.
    """
    system_message = {
        "role": "system",
        "content": "You are an assistant aiding a software developer. Be precise in formatting the output correctly as requested",
    }
    # Example, actual input, and the cue that asks for the JSON answer.
    user_messages = [
        {"role": "user", "content": text}
        for text in (LLM_EXAMPLE, conference_html, "OUTPUT (JSON):")
    ]

    completion = llm.create_chat_completion(
        messages=[system_message, *user_messages],
        max_tokens=None,
    )

    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]


def crawl_page(url: str, timeout: float = 30.0) -> list:
    """Download one listing page and return its conference entries as HTML.

    The entries are the sibling tags that follow the first
    ``<h2 class="wp-block-heading">`` up to (not including) the
    ``<div class="page-links">`` element. Tags without any visible text are
    skipped.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on a stalled
            connection.

    Returns:
        A list of strings, each the HTML markup of one non-empty element,
        in document order.

    Raises:
        Exception: If the response status is not 200, or if either the
            heading or the page-links element is missing from the page.
            Connection errors and timeouts raised by ``requests`` propagate
            unchanged.
    """
    print(f"Crawling {url}")
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Failed to fetch {url} (HTTP status {response.status_code})")

    soup = BeautifulSoup(response.text, "html.parser")

    # The conference list is delimited by these two marker elements.
    heading = soup.find("h2", class_="wp-block-heading")
    page_links = soup.find("div", class_="page-links")
    if heading is None or page_links is None:
        raise Exception(f"Failed to find elements in {url}")

    # Walk the siblings after the heading until the page-links marker (or the
    # end of the sibling list) is reached.
    # NOTE: `!=` on bs4 tags compares markup, not identity, so the walk stops
    # at the first sibling whose markup equals the page-links element.
    snippets = []
    current = heading.find_next_sibling()
    while current is not None and current != page_links:
        if current.name is not None and current.text.strip():
            snippets.append(str(current))
        current = current.find_next_sibling()

    print(f"Found {len(snippets)} conferences")
    return snippets


baseurl = "https://www.dm.unipi.it/research/past-conferences/"
# The first page lives at the base URL; pages 2 to 6 append their page number.
page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 7)]

# Crawl every page up front and flatten the per-page lists into one list.
conference_html_snippets = [snippet for link in page_urls for snippet in crawl_page(link)]

print("LLM Example Context:")
print(LLM_EXAMPLE)

# Load the model, set the chat format, and use the default model context length.
llm = Llama(model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf", chat_format="llama-2", n_ctx=0)


# The result file is a sequence of JSON objects, one per line. Opening it in
# "w" mode clears the output of any previous run, and the `with` block
# guarantees the file is closed even if a translation raises.
with open(OUTPUT_FILE, "w") as results_file:
    for conference_html in conference_html_snippets:
        print("Translating:")
        print(conference_html)

        conference_json = translate_to_json(conference_html)

        print("Result:")
        print(conference_json)

        try:
            # Parse the result string into a JSON object to check correctness.
            conference_object = json.loads(conference_json)
        except (ValueError, TypeError):
            # ValueError covers json.JSONDecodeError (malformed answer);
            # TypeError covers a non-string answer. Anything else, including
            # Ctrl-C and file write errors, is deliberately not swallowed.
            print("> json is invalid, skipping")
            print(conference_json)
            continue

        json.dump(conference_object, results_file)
        results_file.write("\n")
        # Flush after every record so partial results survive an interruption.
        results_file.flush()