initial commit, reworked @BachoSeven's code

main
Antonio De Lucreziis 9 months ago
commit 8c6364277e

11
.gitignore vendored

@ -0,0 +1,11 @@
# Local files
*.local*
# Python
venv/
# Editors
.vscode/
# LLM Models
*.gguf

@ -0,0 +1,40 @@
# Past Conferences Crawler (with LLM)
A Python script that crawls conferences from <https://www.dm.unipi.it/research/past-conferences/> and processes them using a local-run LLM (we used [Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)) to translate the natural language info to a more structured format (json).
## Installation
Download the LLM model to use, specifically we use [`Mistral-7B-Instruct-v0.2-GGUF`](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF):
```bash
# download the model, ~4GB
$ wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf
```
Install the Python requirements:
```bash
# if you want to create a venv
$ python -m venv venv
$ source venv/bin/activate
# enable gpu support for llama-cpp
$ export CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"
# install requirements
$ pip install -r requirements.txt
```
## Launch
The following command will crawl the conferences from `https://www.dm.unipi.it/research/past-conferences/` (pages 1 to 6) and save the results in `conferences.json`:
```bash
$ python main.py
```
The output is a list of json objects, one per line. To display the results with `jq`:
```bash
$ jq -s '.' conferences.json
```

@ -0,0 +1,126 @@
#!/usr/bin/env python3
from llama_cpp import Llama
import requests
from bs4 import BeautifulSoup
import textwrap
import json
# One-shot prompt example shown to the LLM: a raw conference <p> snippet
# (INPUT) followed by the structured JSON we expect back (OUTPUT).
# NOTE: the OUTPUT must itself be valid JSON, since the model mimics it and
# the script later validates answers with json.loads. The original example
# was missing the comma between the "organizers" array and "location",
# teaching the model to produce unparseable output — fixed here ("], ").
LLM_EXAMPLE = (
    "INPUT:\n"
    '<p><a href="http://www.crm.sns.it/event/507/" target="_blank" rel="noreferrer noopener">Statistical'
    " and Computational Aspects of Dynamics<br></a>Organized by Buddhima Kasun Fernando Akurugodage"
    " (Centro di ricerca matematica Ennio De Giorgi &#8211; SNS), Paolo Giulietti, and Tanja"
    " Isabelle Schindler (Universität Wien, Austria). Centro De Giorgi &#8211; SNS, Pisa. December 13"
    " &#8211; 16, 2022.</p>\n"
    "\n"
    "OUTPUT (JSON): \n"
    "{"
    '"title": "Statistical and Computational Aspects of Dynamics",'
    '"link": "http://www.crm.sns.it/event/507/", '
    '"description": "Organized by Buddhima Kasun Fernando Akurugodage (Centro di ricerca matematica'
    " Ennio De Giorgi SNS), Paolo Giulietti, and Tanja Isabelle Schindler (Universität Wien,"
    ' Austria). Centro De Giorgi", '
    '"organizers": ['
    '"Buddhima Kasun Fernando Akurugodage", '
    '"Paolo Giulietti", '
    '"Tanja Isabelle Schindler"'
    "], "
    '"location": "SNS, Pisa", '
    '"date": "December 13 16, 2022.", '
    '"start_date": "2022-12-13", '
    '"end_date": "2022-12-16"'
    "}\n"
    "\n"
    "INPUT:\n"
)
def translate_to_json(conference_html: str) -> str:
    """Ask the local LLM to rewrite one conference HTML snippet as a JSON string.

    The chat is primed with the one-shot LLM_EXAMPLE, then the snippet, then
    the "OUTPUT (JSON):" cue so the model completes with just the JSON object.
    Returns the raw model text; validity is checked by the caller.
    Uses the module-level `llm` instance (loaded after the crawl step).
    """
    chat = [
        {
            "role": "system",
            "content": "You are an assistant aiding a software developer. Be precise in formatting the output correctly as requested",
        },
        {"role": "user", "content": LLM_EXAMPLE},
        {"role": "user", "content": conference_html},
        {"role": "user", "content": "OUTPUT (JSON):"},
    ]
    response = llm.create_chat_completion(max_tokens=None, messages=chat)
    first_choice = response["choices"][0]
    return first_choice["message"]["content"]
def crawl_page(url):
    """Fetch one past-conferences page and return its conference snippets.

    Each conference on the page is a sibling element between the
    `h2.wp-block-heading` heading and the `div.page-links` pagination block;
    every non-empty element in that span is returned as an HTML string.

    Raises Exception (with the offending URL in the message) when the page
    cannot be fetched or the expected landmarks are missing.
    """
    print(f"Crawling {url}")
    r = requests.get(url)
    # Guard clauses keep the happy path flat; include the URL so a failure
    # deep in the multi-page crawl is immediately attributable.
    if r.status_code != 200:
        raise Exception(f"Failed to fetch {url} (status {r.status_code})")
    soup = BeautifulSoup(r.text, "html.parser")
    # The two landmarks delimiting the conference list
    h2 = soup.find("h2", class_="wp-block-heading")
    div = soup.find("div", class_="page-links")
    if h2 is None or div is None:
        raise Exception(f"Failed to find elements in {url}")
    # Collect every non-empty element strictly between the two landmarks
    result = []
    current = h2.find_next_sibling()
    while current is not None and current != div:
        # current.name is None for stray NavigableStrings; skip those and
        # whitespace-only elements
        if current.name is not None and current.text.strip():
            result.append(str(current))
        current = current.find_next_sibling()
    print(f"Found {len(result)} conferences")
    return result
# Crawl the paginated listing: the base URL is page 1, pages 2..6 are
# baseurl + "2" ... baseurl + "6".
baseurl = "https://www.dm.unipi.it/research/past-conferences/"
page_urls = [baseurl] + [baseurl + str(i) for i in range(2, 7)]
# Flatten all pages into one list of per-conference HTML snippets
conference_html_snippets = [snippet for link in page_urls for snippet in crawl_page(link)]
# Show the prompt context once for transparency/debugging.
# (Removed a leftover `exit(1)` debug statement here that aborted the script
# before any LLM processing could run.)
print("LLM Example Context:")
print(LLM_EXAMPLE)
# Load the model; chat_format="llama-2" matches Mistral-Instruct's prompt
# template, n_ctx=0 keeps the model's own default context length.
llm = Llama(model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf", chat_format="llama-2", n_ctx=0)

# The result file is a sequence of JSON objects, one per line (JSON Lines).
# Open once in "w" mode — this truncates any previous run's output — and use
# a context manager so the handle is closed even if a translation crashes.
with open("conferences.json", "w") as results_file:
    for conference_html in conference_html_snippets:
        print("Translating:")
        print(conference_html)
        conference_json = translate_to_json(conference_html)
        print("Result:")
        print(conference_json)
        try:
            # parse the result string into a json object to check correctness
            conference_object = json.loads(conference_json)
        except json.JSONDecodeError:
            # LLM output is best-effort: skip unparseable answers, keep going
            print("> json is invalid, skipping")
            print(conference_json)
            continue
        json.dump(conference_object, results_file)
        results_file.write("\n")
        # flush so partial results survive a crash mid-run
        results_file.flush()

@ -0,0 +1,14 @@
beautifulsoup4==4.12.3
bs4==0.0.2
certifi==2023.11.17
charset-normalizer==3.3.2
diskcache==5.6.3
idna==3.6
Jinja2==3.1.3
llama_cpp_python==0.2.36
MarkupSafe==2.1.4
numpy==1.26.3
requests==2.31.0
soupsieve==2.5
typing_extensions==4.9.0
urllib3==2.1.0
Loading…
Cancel
Save