"""Scrape word pages from https://www.cnrtl.fr/definition/ into SQLite.

Recovered from the patch that added ``tlfi.py`` in commit
33bdcb16bf359771c0b39ed9dbeadeefdd2594af by DaLynX
(Fri Aug 30 20:29:12 2024 +0900, "Initial commit: scrape to sqlite").

The script works in two phases, selected by the state of ``tlfi.db``:

1. If the ``word_links`` table does not exist, it is created and filled
   with one ``(word, link)`` row per word found in the site's letter index.
2. Otherwise, up to ``limit`` words that have no row in ``word_pages`` yet
   are fetched and stored as a JSON list of HTML fragments.

Usage: ``python tlfi.py [LIMIT]`` (``LIMIT`` defaults to 1).
"""
import json
import sqlite3
import sys

from requests_html import HTMLSession
from tqdm import tqdm

# Offset increment between two consecutive index pages of a letter.
# NOTE(review): presumably the number of words listed per index page.
INDEX_PAGE_STEP = 80

session = HTMLSession()

con = sqlite3.connect('tlfi.db')
cur = con.cursor()

# Maximum number of words whose pages are fetched in this run.
if len(sys.argv) > 1:
    limit = int(sys.argv[1])
else:
    limit = 1
print("Limit:", limit)


def get_pages(word, link):
    """Yield the HTML of each entry page of *word*.

    The number of ``<a>`` elements inside ``#vtoolbar`` on the first page is
    taken as the number of entries. The first entry is read from *link*
    itself, the following ones from ``{link}/1``, ``{link}/2``, ...

    Args:
        word: The word being looked up (used for progress messages only).
        link: URL of the word's first entry page.

    Yields:
        The HTML of the ``#lexicontent`` element of each entry page.

    Raises:
        AttributeError: If ``#vtoolbar`` or ``#lexicontent`` is missing from
            a fetched page (``find(..., first=True)`` returned ``None``).
    """
    r = session.get(link)
    nb_entries = len(r.html.find('#vtoolbar', first=True).find('a'))
    tqdm.write(f"Word '{word}' has {nb_entries} entries.")
    yield r.html.find('#lexicontent', first=True).html
    for entry in range(1, nb_entries):
        r = session.get(f"{link}/{entry}")
        yield r.html.find('#lexicontent', first=True).html


def get_words(link):
    """Yield one ``{word: url}`` dict per index page of a letter.

    Index pages are requested as ``{link}/{offset}``. The offset starts at 0
    and grows by ``INDEX_PAGE_STEP`` for as long as the fetched page contains
    the text "Page suivante".

    Args:
        link: URL of the letter's index.

    Yields:
        A dict mapping the last path segment of every absolute link found in
        the ``.hometab`` element to that link.
    """
    offset = 0
    while True:
        r = session.get(f"{link}/{offset}")
        words = r.html.find('.hometab', first=True)
        # The word is the last path segment of its URL.
        yield {url.rsplit('/', 1)[-1]: url for url in words.absolute_links}
        if "Page suivante" not in r.html.html:
            break
        offset += INDEX_PAGE_STEP


def get_letters_and_words():
    """Fill ``word_links`` with every word reachable from the letter index.

    Letters are read from the ``.letterHeader`` element of the definition
    home page and processed in sorted order. The key of each letter is the
    last character of its link.
    """
    r = session.get('https://www.cnrtl.fr/definition/')
    letters = r.html.find('.letterHeader', first=True)
    letters_dic = {url[-1]: url for url in letters.absolute_links}

    for letter, letter_link in sorted(letters_dic.items()):
        print(f"Processing letter {letter}")
        words = {}
        for batch in get_words(letter_link):
            words.update(batch)
            print('.', end='', flush=True)
        print('')
        nb = len(words)
        print(f"Found {nb} words for letter {letter}.")
        cur.executemany("INSERT INTO word_links VALUES(?, ?)", words.items())
        # Commit per letter so an interrupted run keeps the letters already
        # scraped.
        con.commit()


try:
    res = cur.execute("SELECT name FROM sqlite_master WHERE name='word_links'")
    if res.fetchone() is None:
        # First run: build the word index only.
        # NOTE(review): if this scrape is interrupted, the table already
        # exists and later runs will not resume it.
        cur.execute("CREATE TABLE word_links(word, link)")
        get_letters_and_words()
    else:
        res = cur.execute(
            "SELECT name FROM sqlite_master WHERE name='word_pages'"
        )
        if res.fetchone() is None:
            cur.execute("CREATE TABLE word_pages(word, page)")
        # fetchall() materializes the work list up front because ``cur`` is
        # reused for the INSERT inside the loop.
        pending = cur.execute(
            "SELECT word, link FROM word_links LEFT JOIN word_pages "
            "USING (word) WHERE page IS NULL ORDER BY word LIMIT ?",
            (limit,),
        ).fetchall()
        for word, link in tqdm(pending):
            tqdm.write(f"Looking up word '{word}'")
            try:
                pages = list(get_pages(word, link))
                j = json.dumps(pages, indent=4)
            except Exception as exc:
                # Best effort: record the failure and move on to the next
                # word. KeyboardInterrupt/SystemExit are deliberately not
                # caught, so Ctrl-C stops the run instead of marking the
                # current word as failed.
                j = "ERROR"
                tqdm.write(f"ERROR on word '{word}': {exc!r}")
            cur.execute("INSERT INTO word_pages VALUES(?, ?)", (word, j))
            con.commit()
finally:
    # Release the database even when scraping fails or is interrupted.
    con.close()