initial commit
This commit is contained in:
64
src/db.py
Normal file
64
src/db.py
Normal file
@@ -0,0 +1,64 @@
|
||||
from sqlalchemy import create_engine, Column, Integer, String, \
|
||||
ForeignKey, UniqueConstraint
|
||||
from sqlalchemy.engine import URL
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
import logging
|
||||
|
||||
|
||||
# Shared declarative base class: every ORM model in this module subclasses it,
# and the table-creation entry point uses its ``metadata`` to create the tables.
Base = declarative_base()
|
||||
|
||||
class Ingredient(Base):
    """ORM model for the ``Ingredient`` table: one row per named ingredient."""

    __tablename__ = "Ingredient"

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # Ingredient name; required (NOT NULL).
    name = Column(String, nullable=False)
|
||||
|
||||
class RecipeSite(Base):
    """ORM model for the ``RecipeSite`` table: one row per recipe web site."""

    __tablename__ = "RecipeSite"

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # Site name; required and unique across sites.
    name = Column(String, unique=True, nullable=False)
    # Required string identifying ingredient elements on a page
    # (presumably an HTML class name -- confirm against the scraper).
    ingredient_class = Column(String, nullable=False)
    # Required string identifying the recipe-name element on a page
    # (presumably an HTML class name -- confirm against the scraper).
    name_class = Column(String, nullable=False)
    # Base URL of the site; required and unique across sites.
    base_url = Column(String, unique=True, nullable=False)
|
||||
|
||||
class Recipe(Base):
    """ORM model for the ``Recipe`` table: one row per recipe of a site.

    A recipe is identified by ``identifier`` within the site referenced by
    ``recipe_site_id``; that pair must be unique.
    """

    __tablename__ = "Recipe"
    # Declare the uniqueness rule through ``__table_args__`` (the documented
    # declarative mechanism) instead of a bare ``UniqueConstraint(...)``
    # expression in the class body, which only takes effect through implicit
    # column auto-attachment.
    __table_args__ = (
        UniqueConstraint("identifier", "recipe_site_id"),
    )

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # Recipe name; nullable.
    name = Column(String)
    # Site-specific identifier of the recipe; required.
    identifier = Column(String, nullable=False)
    # Foreign key to the owning RecipeSite row.
    recipe_site_id = Column(Integer, ForeignKey("RecipeSite.id"))
|
||||
|
||||
class RecipeIngredient(Base):
    """ORM model for the ``RecipeIngredient`` table: one ingredient line of a recipe."""

    __tablename__ = "RecipeIngredient"

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # Text of the ingredient line; required.
    text = Column(String, nullable=False)
    # Foreign key to the Recipe this line belongs to.
    recipe_id = Column(Integer, ForeignKey("Recipe.id"))
    # Foreign key to an Ingredient row; nullable.
    ingredient_id = Column(Integer, ForeignKey("Ingredient.id"))
|
||||
|
||||
|
||||
def get_engine(use_dotenv=True, **kargs):
    """Create a SQLAlchemy engine for the PostgreSQL database set in the environment.

    Connection settings are read from these environment variables:

    * ``POSTGRES_URL`` -- used as the host of the connection URL
    * ``POSTGRES_USER`` -- user name
    * ``POSTGRES_PASSWORD`` -- password
    * ``POSTGRES_DB`` -- database name

    Args:
        use_dotenv: When true, call ``load_dotenv()`` before reading the
            environment.
        **kargs: Extra keyword arguments forwarded to ``create_engine``
            (they were previously accepted but silently ignored).

    Returns:
        The engine returned by ``create_engine``.
    """
    if use_dotenv:
        load_dotenv()

    eng_url = URL.create(
        'postgresql',
        username=os.getenv("POSTGRES_USER"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_URL"),
        database=os.getenv("POSTGRES_DB"),
    )
    return create_engine(eng_url, **kargs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: create every table registered on Base.metadata.
    # Logging must be configured first; otherwise the root logger only emits
    # WARNING and above and the INFO line below is never shown.
    logging.basicConfig(level=logging.INFO)
    eng = get_engine()
    # Render the URL with the password masked so credentials never reach the
    # log output, regardless of how the URL object stringifies itself.
    logging.info("Createing DB Tables: %s",
                 eng.url.render_as_string(hide_password=True))
    # checkfirst=True skips tables that already exist.
    Base.metadata.create_all(eng, checkfirst=True)
|
||||
13
src/func.sql
Normal file
13
src/func.sql
Normal file
@@ -0,0 +1,13 @@
|
||||
-- cos_sim(a, b): similarity score of two texts as predicted by the
-- sentence-transformers cross-encoder "cross-encoder/stsb-roberta-large".
-- NOTE: despite the name, the value is whatever the cross-encoder's
-- predict() returns for the pair, not a cosine computed here.
DROP FUNCTION IF EXISTS cos_sim;
CREATE FUNCTION cos_sim(a TEXT, b TEXT)
RETURNS REAL
AS $$
# The function is not declared STRICT, so SQL NULLs arrive as None;
# return NULL instead of failing inside the model.
if a is None or b is None:
    return None

model_name = "cross-encoder/stsb-roberta-large"

# SD is PL/Python's per-function private dictionary; it is used here to keep
# the loaded model between calls so it is only constructed once.
model = SD.get(model_name)
if model is None:
    # Import only on a cache miss; the previously imported ``util`` was unused.
    from sentence_transformers import CrossEncoder
    model = SD[model_name] = CrossEncoder(model_name)

# predict() takes a list of pairs and returns one score per pair; hand back a
# plain Python float for the REAL return type.
return float(model.predict([(a, b)])[0])
$$ LANGUAGE plpython3u;
|
||||
29
src/insert_sites.py
Normal file
29
src/insert_sites.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import db
|
||||
import json
|
||||
import argparse
|
||||
import logging
|
||||
|
||||
def main():
    """Load recipe-site definitions from a JSON file into the database.

    Command line: ``file`` is the path of a JSON document that yields an
    iterable of objects; each object is passed as keyword arguments to
    ``db.RecipeSite``. ``-v/--verbose`` enables INFO logging, including
    SQLAlchemy engine logging.

    All rows are added inside one ``sessionmaker.begin()`` block, so they are
    committed together when the block exits without error.
    """
    parser = argparse.ArgumentParser(description='Import recipes into database')
    parser.add_argument('file', type=str,
                        help='JSON file with recipe site information')
    parser.add_argument('-v', '--verbose', action='store_true')
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.INFO)
        logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(args.file, encoding="utf-8") as f:
        sites = json.load(f)

    eng = db.get_engine()
    S = sessionmaker(eng)

    with S.begin() as session:
        for site in sites:
            logging.info("Adding %s", site)
            session.add(db.RecipeSite(**site))


# Guarded entry point so importing this module does not parse sys.argv or
# touch the database.
if __name__ == "__main__":
    main()
|
||||
|
||||
|
||||
42
src/scrape.py
Normal file
42
src/scrape.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import db
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import bs4
|
||||
from urllib.request import urlopen
|
||||
import logging
|
||||
from argparse import ArgumentParser
|
||||
|
||||
def main():
    """Scrape one recipe page and store the recipe and its ingredient lines.

    Command line: ``site`` is the name of a stored ``RecipeSite`` row,
    ``identifier`` is appended to that site's ``base_url`` to form the page
    URL, and ``-v/--verbose`` enables INFO logging.

    The recipe name is the text of the first element whose class matches the
    site's ``name_class``; every element whose class matches
    ``ingredient_class`` becomes one ``RecipeIngredient`` row. Everything
    runs inside one ``sessionmaker.begin()`` block, so an error rolls the
    whole insert back.

    Raises:
        ValueError: if the page has no element matching ``name_class``.
    """
    parser = ArgumentParser(description="Scrape a recipe site for recipies")
    parser.add_argument('site',
                        help='Name of site')
    parser.add_argument('identifier',
                        help='url of recipe(reletive to base url of site)')
    parser.add_argument('-v', '--verbose', action='store_true')
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.INFO)
        logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

    eng = db.get_engine()
    S = sessionmaker(eng)

    with S.begin() as sess:
        # Look up the site named on the command line; the name used to be
        # hard-coded to 'AllRecipe', which made the ``site`` argument a no-op.
        site = (
            sess.query(db.RecipeSite)
            .where(db.RecipeSite.name == args.site)
            .one()
        )

        recipe = db.Recipe(identifier=args.identifier, recipe_site_id=site.id)
        with urlopen(site.base_url + recipe.identifier) as f:
            # Name the parser explicitly so the result does not depend on
            # which optional parser libraries happen to be installed.
            recipe_page = bs4.BeautifulSoup(f.read().decode(), "html.parser")

        name_div = recipe_page.find(class_=site.name_class)
        if name_div is None:
            raise ValueError(
                f"no element with class {site.name_class!r} found on the "
                "recipe page"
            )
        recipe.name = name_div.text
        sess.add(recipe)
        # Flush so the database assigns recipe.id before it is used below.
        sess.flush()
        logging.info(f"Adding Recipe {recipe}")

        ingredients = [
            db.RecipeIngredient(text=ingredient.text, recipe_id=recipe.id)
            for ingredient in recipe_page.find_all(class_=site.ingredient_class)
        ]
        logging.info(f"{len(ingredients)} ingredients found. Inserting into DB")
        sess.add_all(ingredients)


# Guarded entry point so importing this module does not parse sys.argv,
# hit the network or touch the database.
if __name__ == "__main__":
    main()
|
||||
20
src/triggers.sql
Normal file
20
src/triggers.sql
Normal file
@@ -0,0 +1,20 @@
|
||||
-- Trigger function: set NEW.ingredient_id to the best-matching "Ingredient".
-- Candidates are ingredients whose name shares at least one
-- whitespace-separated word with NEW.text; among them the one with the
-- highest cos_sim(NEW.text, name) is chosen. With no candidate,
-- ingredient_id is set to NULL.
CREATE OR REPLACE FUNCTION recipe_ingredient_update()
RETURNS TRIGGER
AS
$$
BEGIN
    -- ORDER BY and LIMIT are kept in the same query: the ordering of a CTE
    -- is not guaranteed to carry over to an outer SELECT ... LIMIT 1.
    SELECT "Ingredient".id
    INTO NEW.ingredient_id
    FROM "Ingredient"
    WHERE regexp_split_to_array(NEW.text, E'\\s+') && regexp_split_to_array("Ingredient".name, E'\\s+')
    ORDER BY cos_sim(NEW.text, "Ingredient".name) DESC
    LIMIT 1;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Run recipe_ingredient_update() for each row before it is inserted into
-- "RecipeIngredient".
CREATE OR REPLACE TRIGGER match_ingredient
    BEFORE INSERT ON "RecipeIngredient"
    FOR EACH ROW EXECUTE FUNCTION recipe_ingredient_update();
|
||||
Reference in New Issue
Block a user