conferences-llm-crawler/main.py

#!/usr/bin/env python3

from llama_cpp import Llama
from bs4 import BeautifulSoup
import requests
import json


# Path of the results file written by the main loop at the bottom of this
# script. Despite the ".json" extension, the file holds one JSON object per line.
OUTPUT_FILE = "results.json"


# Few-shot example 1 (input): a conference entry whose title is wrapped in a
# link and whose dashes are still HTML entities (&#8211;). Kept as a raw string
# so the markup reaches the model exactly as written here.
HTML_EXAMPLE_1 = r"""<p><a href="http://www.crm.sns.it/event/507/" target="_blank" rel="noreferrer noopener">Statistical and Computational Aspects of Dynamics<br></a>Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi &#8211; SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria). Centro De Giorgi &#8211; SNS, Pisa. December 13 &#8211; 16, 2022.</p>"""


# Few-shot example 1 (expected output): the serialized JSON answer for
# HTML_EXAMPLE_1. The description carries the organizers, then a blank line,
# then a "Location: ..." line; dates are written as YYYY-MM-DD.
# NOTE: json.dumps is called with its defaults, so non-ASCII characters
# (e.g. the en dash and "ä") appear as \uXXXX escapes in the resulting string.
OUTPUT_EXAMPLE_1 = json.dumps({ 
    "title": "Statistical and Computational Aspects of Dynamics",
    "url": "http://www.crm.sns.it/event/507/",
    "description": "Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi – SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria).\n\nLocation: Centro De Giorgi – SNS, Pisa",
    "startDate": "2022-12-13",
    "endDate": "2022-12-16"
})


# Few-shot example 2 (input): same layout as example 1, but the link is nested
# inside an <em> tag and the dashes are literal characters instead of entities.
HTML_EXAMPLE_2 = r"""<p><em><a href="http://pagine.dm.unipi.it/berselli/meeting2020/" rel="noreferrer noopener" target="_blank">Workshop on Variational problems, PDEs and applications<br/></a></em>Organized by Luigi Berselli, Giuseppe Buttazzo, Matteo Novaga, and Andrea Malchiodi (Scuola Normale Superiore, Pisa). Department of Mathematics, Pisa. January 17 – 18, 2020.</p>"""


# Few-shot example 2 (expected output): serialized JSON answer for HTML_EXAMPLE_2.
OUTPUT_EXAMPLE_2 = json.dumps({
    "title": "Workshop on Variational problems, PDEs and applications",
    "url": "http://pagine.dm.unipi.it/berselli/meeting2020/",
    "description": "Organized by Luigi Berselli, Giuseppe Buttazzo, Matteo Novaga, and Andrea Malchiodi (Scuola Normale Superiore, Pisa).\n\nLocation: Department of Mathematics, Pisa",
    "startDate": "2020-01-17",
    "endDate": "2020-01-18",
})


# Few-shot example 3 (input): an entry without any link, an online event, and a
# date range that spans two different months.
HTML_EXAMPLE_3 = r"""<p>Geometric Representation Theory. ICM Satellite Conference<br/>Organized by Tomoyuki Arakawa (RIMS, Kyoto, Japan), Joel Kamnitzer (University of Toronto, Japan), Hiraku Nakajima (Kavli IPMU, Japan), Markus Reineke (Ruhr-Universität Bochum), Francesco Sala, and Vera Serganova (University of California Berkeley, USA). Online. June 27 – July 2,  2022.</p>"""


# Few-shot example 3 (expected output): serialized JSON answer for
# HTML_EXAMPLE_3. The missing link is represented by None, which json.dumps
# serializes as null.
OUTPUT_EXAMPLE_3 = json.dumps({
    "title": "Geometric Representation Theory. ICM Satellite Conference",
    "url": None,
    "description": "Organized by Tomoyuki Arakawa (RIMS, Kyoto, Japan), Joel Kamnitzer (University of Toronto, Japan), Hiraku Nakajima (Kavli IPMU, Japan), Markus Reineke (Ruhr-Universität Bochum), Francesco Sala, and Vera Serganova (University of California Berkeley, USA).\n\nLocation: Online",
    "startDate": "2022-06-27",
    "endDate": "2022-07-02",
})


def translate_to_json(conference_html: str) -> str:
    """Ask the model to turn one conference HTML snippet into JSON text.

    The prompt consists of a system instruction, the three module-level
    few-shot examples (each sent as INPUT / html / OUTPUT JSON / json user
    messages) and finally the snippet to convert, left open after
    "OUTPUT JSON:" so that the model supplies the answer.

    Args:
        conference_html: HTML markup of a single conference entry.

    Returns:
        The raw text of the first completion choice. It is not validated
        here; the caller decides whether it parses as JSON.
    """

    def as_user(text):
        # Every prompt fragment, examples included, is sent with the "user" role.
        return {"role": "user", "content": text}

    few_shot_pairs = (
        (HTML_EXAMPLE_1, OUTPUT_EXAMPLE_1),
        (HTML_EXAMPLE_2, OUTPUT_EXAMPLE_2),
        (HTML_EXAMPLE_3, OUTPUT_EXAMPLE_3),
    )

    prompt = [
        {
            "role": "system",
            "content": "You are an assistant helping a developer converting raw text data to JSON. Be precise in formatting the output and only output valid JSON using the specificied fields, without including any additional fields or comments",
        }
    ]

    # Worked examples: input snippet followed by the expected JSON answer.
    for example_html, example_json in few_shot_pairs:
        prompt.append(as_user("INPUT:"))
        prompt.append(as_user(example_html))
        prompt.append(as_user("OUTPUT JSON:"))
        prompt.append(as_user(example_json))

    # Actual item to process; the answer slot is intentionally left empty.
    prompt.append(as_user("INPUT:"))
    prompt.append(as_user(conference_html))
    prompt.append(as_user("OUTPUT JSON:"))

    completion = llm.create_chat_completion(max_tokens=None, messages=prompt)
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]


def crawl_page(url, timeout=30):
    """Download one listing page and return the HTML of each conference entry.

    The entries are the sibling elements that follow the first
    ``<h2 class="wp-block-heading">`` of the page, up to (and excluding) the
    first ``<div class="page-links">``. Siblings without any visible text are
    skipped.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block forever on a stalled server.

    Returns:
        A list of HTML strings, one per non-empty element found.

    Raises:
        Exception: If the response status is not 200, or if either of the two
            delimiting elements is missing from the page.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    print(f"Crawling {url}")
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Failed to fetch {url} (HTTP {response.status_code})")

    soup = BeautifulSoup(response.text, "html.parser")

    # The conference entries sit between these two elements.
    heading = soup.find("h2", class_="wp-block-heading")
    page_links = soup.find("div", class_="page-links")
    if heading is None or page_links is None:
        raise Exception(f"Failed to find elements in {url}")

    result = []
    current = heading.find_next_sibling()
    # Stop on the delimiter *object* itself: comparing tags with == / != is a
    # structural comparison, so a look-alike element could end the walk early.
    while current is not None and current is not page_links:
        # Keep only real tags that contain some visible text.
        if current.name is not None and current.text.strip():
            result.append(str(current))
        current = current.find_next_sibling()

    print(f"Found {len(result)} conferences")
    return result


baseurl = "https://www.dm.unipi.it/research/past-conferences/"
# The first page lives at the base URL itself; the following pages (2 to 7)
# are addressed by appending their number to it.
page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 8)]

# Crawl every page before loading the model, so that a crawling failure
# aborts the run before any translation work has started.
conference_html_snippets = [snippet for link in page_urls for snippet in crawl_page(link)]

# Load the model, set the chat format and use the default model context length
llm = Llama(
    model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf", 
    chat_format="llama-2", 
    verbose=False,
    n_ctx=0,
)

# The result file is a sequence of JSON objects, one per line. The context
# manager guarantees the file is closed even if a translation raises.
with open(OUTPUT_FILE, "w", encoding="utf-8") as results_file:
    for conference_html in conference_html_snippets:
        print("--------------------------------------------------")
        print("Translating:")
        print(conference_html)

        conference_json = translate_to_json(conference_html)

        # "success" is always added below; "json" is added only when the
        # model output parses correctly.
        result = {
            "input_html": conference_html,
            "raw_output": conference_json,
        }

        print("Result:")
        print(conference_json)

        try:
            # Parse the result string into a JSON object to check correctness.
            conference_object = json.loads(conference_json)
        except (ValueError, TypeError):
            # ValueError covers json.JSONDecodeError; TypeError covers a
            # non-string answer. Anything else (e.g. Ctrl-C) must propagate.
            print("> json is invalid, skipping")

            result["success"] = False
        else:
            result["success"] = True
            result["json"] = conference_object

        json.dump(result, results_file)
        results_file.write("\n")
        # Flush after every record so that partial results survive an
        # interrupted run.
        results_file.flush()
-												initial commit, reworked @BachoSeven's code

											
										
										
											9 months ago
+								#!/usr/bin/env python3
 								from llama_cpp import Llama
 								from bs4 import BeautifulSoup
-												fix: unused import

											
										
										
											9 months ago
+								import requests
-												initial commit, reworked @BachoSeven's code

											
										
										
											9 months ago
+								import json
-												chore: updated readme, minor changes to the code

											
										
										
											9 months ago
-												feat: better result file structure

											
										
										
											9 months ago
+								OUTPUT_FILE = "results.json"
-												chore: minor enhancements

											
										
										
											9 months ago
-												initial commit, reworked @BachoSeven's code

											
										
										
											9 months ago
-												fix: added another example

											
										
										
											9 months ago
+								HTML_EXAMPLE_1 = r"""<p><a href="http://www.crm.sns.it/event/507/" target="_blank" rel="noreferrer noopener">Statistical and Computational Aspects of Dynamics<br></a>Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi &#8211; SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria). Centro De Giorgi &#8211; SNS, Pisa. December 13 &#8211; 16, 2022.</p>"""
-												chore: updated readme, minor changes to the code

											
										
										
											9 months ago
-												fix: added another example

											
										
										
											9 months ago
+								OUTPUT_EXAMPLE_1 = json.dumps({
-												chore: updated readme, minor changes to the code

											
										
										
											9 months ago
+								    "title": "Statistical and Computational Aspects of Dynamics",
 								    "url": "http://www.crm.sns.it/event/507/",
-												fix: added another example without link

											
										
										
											9 months ago
+								    "description": "Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica Ennio De Giorgi – SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien, Austria).\n\nLocation: Centro De Giorgi – SNS, Pisa",
-												chore: updated readme, minor changes to the code

											
										
										
											9 months ago
+								    "startDate": "2022-12-13",
 								    "endDate": "2022-12-16"
 								})
-												initial commit, reworked @BachoSeven's code

											
										
										
											9 months ago
-												fix: added another example

											
										
										
											9 months ago
+								HTML_EXAMPLE_2 = r"""<p><em><a href="http://pagine.dm.unipi.it/berselli/meeting2020/" rel="noreferrer noopener" target="_blank">Workshop on Variational problems, PDEs and applications<br/></a></em>Organized by Luigi Berselli, Giuseppe Buttazzo, Matteo Novaga, and Andrea Malchiodi (Scuola Normale Superiore, Pisa). Department of Mathematics, Pisa. January 17 – 18, 2020.</p>"""
 								OUTPUT_EXAMPLE_2 = json.dumps({
 								    "title": "Workshop on Variational problems, PDEs and applications",
 								    "url": "http://pagine.dm.unipi.it/berselli/meeting2020/",
-												fix: added another example without link

											
										
										
											9 months ago
+								    "description": "Organized by Luigi Berselli, Giuseppe Buttazzo, Matteo Novaga, and Andrea Malchiodi (Scuola Normale Superiore, Pisa).\n\nLocation: Department of Mathematics, Pisa",
-												fix: added another example

											
										
										
											9 months ago
+								    "startDate": "2020-01-17",
 								    "endDate": "2020-01-18",
 								})
-												fix: added another example without link

											
										
										
											9 months ago
+								HTML_EXAMPLE_3 = r"""<p>Geometric Representation Theory. ICM Satellite Conference<br/>Organized by Tomoyuki Arakawa (RIMS, Kyoto, Japan), Joel Kamnitzer (University of Toronto, Japan), Hiraku Nakajima (Kavli IPMU, Japan), Markus Reineke (Ruhr-Universität Bochum), Francesco Sala, and Vera Serganova (University of California Berkeley, USA). Online. June 27 – July 2,  2022.</p>"""
 								OUTPUT_EXAMPLE_3 = json.dumps({
 								    "title": "Geometric Representation Theory. ICM Satellite Conference",
 								    "url": None,
 								    "description": "Organized by Tomoyuki Arakawa (RIMS, Kyoto, Japan), Joel Kamnitzer (University of Toronto, Japan), Hiraku Nakajima (Kavli IPMU, Japan), Markus Reineke (Ruhr-Universität Bochum), Francesco Sala, and Vera Serganova (University of California Berkeley, USA).\n\nLocation: Online",
 								    "startDate": "2022-06-27",
 								    "endDate": "2022-07-02",
 								})
-												initial commit, reworked @BachoSeven's code

											
										
										
											9 months ago
+								def translate_to_json(conference_html: str) -> str:
 								    llm_answer = llm.create_chat_completion(
 								        max_tokens=None,
 								        messages=[
 								            {
 								                "role": "system",
-												fix: updated system prompt

											
										
										
											9 months ago
+								                "content": "You are an assistant helping a developer converting raw text data to JSON. Be precise in formatting the output and only output valid JSON using the specificied fields, without including any additional fields or comments",
-												initial commit, reworked @BachoSeven's code

											
										
										
											9 months ago
+								            },
-												fix: added another example without link

											
										
										
											9 months ago
+								            # Example 1
 								            { "role": "user", "content": "INPUT:" },
 								            { "role": "user", "content": HTML_EXAMPLE_1 },
 								            { "role": "user", "content": "OUTPUT JSON:" },
 								            { "role": "user", "content": OUTPUT_EXAMPLE_1 },
 								            # Example 2
 								            { "role": "user", "content": "INPUT:" },
 								            { "role": "user", "content": HTML_EXAMPLE_2 },
 								            { "role": "user", "content": "OUTPUT JSON:" },
 								            { "role": "user", "content": OUTPUT_EXAMPLE_2 },
 								            # Example 3
 								            { "role": "user", "content": "INPUT:" },
 								            { "role": "user", "content": HTML_EXAMPLE_3 },
 								            { "role": "user", "content": "OUTPUT JSON:" },
 								            { "role": "user", "content": OUTPUT_EXAMPLE_3 },
 								            # Actual item to process
 								            { "role": "user", "content": "INPUT:" },
 								            { "role": "user", "content": conference_html },
 								            { "role": "user", "content": "OUTPUT JSON:" },
-												initial commit, reworked @BachoSeven's code

											
										
										
											9 months ago
+								        ],
 								    )
 								    return llm_answer["choices"][0]["message"]["content"]
 								def crawl_page(url):
 								    print(f"Crawling {url}")
 								    r = requests.get(url)
 								    if r.status_code == 200:
 								        html = r.text
 								        soup = BeautifulSoup(html, "html.parser")
 								        # Find the two elements
 								        h2 = soup.find("h2", class_="wp-block-heading")
 								        div = soup.find("div", class_="page-links")
 								        # Extract all the elements between h2_tag and div_tag
 								        if h2 and div:
 								            result = []
 								            current = h2.find_next_sibling()
 								            while current and current != div:
 								                if current.name is not None and current.text.strip():
 								                    result.append(str(current))
 								                current = current.find_next_sibling()
 								            print(f"Found {len(result)} conferences")
 								            return result
 								        else:
 								            raise Exception("Failed to find elements")
 								    else:
 								        raise Exception("Failed to fetch")
 								baseurl = "https://www.dm.unipi.it/research/past-conferences/"
-												fix: boh

											
										
										
											9 months ago
+								page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 8)]
-												initial commit, reworked @BachoSeven's code

											
										
										
											9 months ago
 								conference_html_snippets = [snippet for link in page_urls for snippet in crawl_page(link)]
 								# Load the model and, set the chat format and use the default model context length
-												fix: boh

											
										
										
											9 months ago
+								llm = Llama(
 								    model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf",
 								    chat_format="llama-2",
 								    verbose=False,
 								    n_ctx=0,
 								)
-												initial commit, reworked @BachoSeven's code

											
										
										
											9 months ago
 								# the result file is a sequence of json objects, one per line
-												fix: added some logging

											
										
										
											9 months ago
+								results_file = open(OUTPUT_FILE, "w")
-												initial commit, reworked @BachoSeven's code

											
										
										
											9 months ago
 								for conference_html in conference_html_snippets:
-												Fix example

											
										
										
											9 months ago
+								    print("--------------------------------------------------")
-												initial commit, reworked @BachoSeven's code

											
										
										
											9 months ago
+								    print("Translating:")
 								    print(conference_html)
 								    conference_json = translate_to_json(conference_html)
-												feat: better result file structure

											
										
										
											9 months ago
+								    result = {
 								        "input_html": conference_html,
 								        "raw_output": conference_json,
 								        # "json": None,
 								        # "success": False,
 								    }
-												initial commit, reworked @BachoSeven's code

											
										
										
											9 months ago
+								    print("Result:")
 								    print(conference_json)
 								    try:
 								        # parse the result string into a json object to check correctness
 								        conference_object = json.loads(conference_json)
-												feat: better result file structure

											
										
										
											9 months ago
 								        result["success"] = True
 								        result["json"] = conference_object
-												initial commit, reworked @BachoSeven's code

											
										
										
											9 months ago
+								    except:
 								        print("> json is invalid, skipping")
-												fix: boh

											
										
										
											9 months ago
-												feat: better result file structure

											
										
										
											9 months ago
+								        result["success"] = False
 								    json.dump(result, results_file)
 								    results_file.write("\n")
 								    results_file.flush()
-												initial commit, reworked @BachoSeven's code

											
										
										
											9 months ago
 								results_file.close()