moved scripts to module
This commit is contained in:
219
src/db.py
219
src/db.py
@@ -1,219 +0,0 @@
|
||||
import os
|
||||
import logging
|
||||
from types import NoneType
|
||||
from dotenv import load_dotenv
|
||||
from xmlrpc.client import Boolean
|
||||
from sqlalchemy import create_engine, Column, Integer, String, Boolean, \
|
||||
ForeignKey, UniqueConstraint, func, select, and_, or_, \
|
||||
not_
|
||||
from sqlalchemy.types import ARRAY
|
||||
from sqlalchemy.engine import URL
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
|
||||
# Shared declarative base class: every ORM model in this module subclasses it,
# and create_tables() uses Base.metadata to emit all of their tables.
Base = declarative_base()
|
||||
|
||||
class Ingredient(Base):
    """ORM model for the ``Ingredient`` table: one row per named ingredient."""

    __tablename__ = "Ingredient"

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # Ingredient name; NULL is rejected.
    name = Column(String, nullable=False)
|
||||
|
||||
class RecipeSite(Base):
    """ORM model for the ``RecipeSite`` table: a site that recipes are scraped from."""

    __tablename__ = "RecipeSite"

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # Site name; required and unique.
    name = Column(String, nullable=False, unique=True)
    # Class name stored for ingredient elements on the site; required.
    ingredient_class = Column(String, nullable=False)
    # Class name stored for the recipe-name element on the site; required.
    name_class = Column(String, nullable=False)
    # Base URL of the site; required and unique.
    base_url = Column(String, nullable=False, unique=True)
|
||||
|
||||
class Recipe(Base):
    """ORM model for the ``Recipe`` table: one recipe belonging to a recipe site."""

    __tablename__ = "Recipe"

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # Recipe name; nullable.
    name = Column(String)
    # Site-specific identifier of the recipe; required.
    identifier = Column(String, nullable=False)
    # Owning site (RecipeSite.id).
    recipe_site_id = Column(Integer, ForeignKey("RecipeSite.id"))

    # Uniqueness of (identifier, recipe_site_id).
    # NOTE(review): this is a bare class-body expression, not __table_args__;
    # confirm the constraint is actually emitted in the generated DDL.
    UniqueConstraint(identifier, recipe_site_id)
|
||||
|
||||
class RecipeIngredient(Base):
    """ORM model for the ``RecipeIngredient`` table: one ingredient line of a recipe."""

    __tablename__ = "RecipeIngredient"

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # Ingredient text of the line; required.
    text = Column(String, nullable=False)
    # Recipe the line belongs to (Recipe.id).
    recipe_id = Column(Integer, ForeignKey("Recipe.id"))
    # Optional link to an Ingredient row (Ingredient.id).
    ingredient_id = Column(Integer, ForeignKey("Ingredient.id"))
|
||||
|
||||
class RecipeIngredientParts(Base):
    """ORM model for the ``RecipeIngredientParts`` table.

    Holds the parsed components of a RecipeIngredient line; the primary key is
    also a foreign key to ``RecipeIngredient.id``, so there is at most one
    parts row per ingredient line.
    """

    __tablename__ = "RecipeIngredientParts"

    # Primary key shared with the RecipeIngredient row this was parsed from.
    id = Column(Integer, ForeignKey("RecipeIngredient.id"), primary_key=True)
    # Parsed components, all nullable strings.
    quantity = Column(String)
    unit = Column(String)
    instruction = Column(String)
    ingredient = Column(String)
    supplement = Column(String)
|
||||
|
||||
class IngredientConnection(Base):
    """ORM model for the ``IngredientConnection`` table.

    One row per (ingredient_a, ingredient_b) pair with the number of recipes
    counted for that pair; the two ingredient columns form the composite
    primary key.
    """

    __tablename__ = "IngredientConnection"

    # NOTE(review): both foreign keys target RecipeIngredientParts.ingredient,
    # which has no unique constraint declared in this module - confirm the
    # database accepts this reference.
    ingredient_a = Column(
        String,
        ForeignKey("RecipeIngredientParts.ingredient"),
        primary_key=True,
    )
    ingredient_b = Column(
        String,
        ForeignKey("RecipeIngredientParts.ingredient"),
        primary_key=True,
    )
    # Recipe count for the pair (incremented by update_graph_connectivity).
    recipe_count = Column(Integer)

    # Bare class-body constraint over the same two columns as the primary key.
    UniqueConstraint(ingredient_a, ingredient_b)
|
||||
|
||||
class RecipeConnection(Base):
    """ORM model for the ``RecipeConnection`` table.

    One row per (recipe_a, recipe_b) pair with an ingredient count for the
    pair; the two recipe ids form the composite primary key.
    """

    __tablename__ = "RecipeConnection"

    # Both ends of the connection reference Recipe.id.
    recipe_a = Column(
        Integer,
        ForeignKey("Recipe.id"),
        primary_key=True,
    )
    recipe_b = Column(
        Integer,
        ForeignKey("Recipe.id"),
        primary_key=True,
    )
    # Ingredient count stored for the pair.
    ingredient_count = Column(Integer)
|
||||
|
||||
class RecipeGraphed(Base):
    """ORM model for the ``RecipeGraphed`` table: per-recipe graph-processing flag."""

    __tablename__ = "RecipeGraphed"

    # Primary key and foreign key to Recipe.id (at most one row per recipe).
    recipe_id = Column(Integer, ForeignKey("Recipe.id"), primary_key=True)
    # Set to True by update_graph_connectivity once the recipe is processed.
    # `Boolean` is SQLAlchemy's type: the sqlalchemy import rebinds the name
    # that was imported earlier from xmlrpc.client.
    status = Column(Boolean, nullable=False, default=False)
|
||||
|
||||
|
||||
def get_engine(use_dotenv=True, **kargs):
    """Create a SQLAlchemy engine for the PostgreSQL database named in the environment.

    Connection settings are read from these environment variables:
    ``POSTGRES_URL`` (used as the host), ``POSTGRES_USER``,
    ``POSTGRES_PASSWORD`` and ``POSTGRES_DB``.

    Args:
        use_dotenv: when true, ``load_dotenv()`` is called first so the
            variables can come from a ``.env`` file.
        **kargs: extra keyword arguments forwarded to ``create_engine``
            (previously they were accepted but silently ignored).

    Returns:
        The engine returned by ``create_engine``.
    """
    if use_dotenv:
        load_dotenv()

    eng_url = URL.create(
        'postgresql',
        username=os.getenv("POSTGRES_USER"),
        password=os.getenv("POSTGRES_PASSWORD"),
        host=os.getenv("POSTGRES_URL"),
        database=os.getenv("POSTGRES_DB"),
    )
    return create_engine(eng_url, **kargs)
|
||||
|
||||
|
||||
def create_tables(eng):
    """Create every table registered on ``Base.metadata`` using engine *eng*.

    ``checkfirst=True`` is passed to ``create_all`` so existing tables are
    checked for before creation.
    """
    logging.info(f"Createing DB Tables: {eng.url}")
    metadata = Base.metadata
    metadata.create_all(eng, checkfirst=True)
|
||||
|
||||
def pair_query(pairable, groupable, recipe_ids=None, pair_type=String):
    """Build a CTE that pairs up the distinct *pairable* values within each group.

    The statement selects *groupable* plus a ``pair`` column, joins to
    ``RecipeIngredientParts`` and groups by *groupable*.  The ``pair`` column
    is the result of a database function applied to
    ``array_agg(DISTINCT pairable)``.

    Args:
        pairable: column whose distinct values are aggregated and paired.
        groupable: column to select and group by.
        recipe_ids: optional collection or selectable of recipe ids; when
            given, rows are limited to ``RecipeIngredient.recipe_id IN
            recipe_ids``.
        pair_type: SQLAlchemy element type of the pair array.  ``Integer``
            selects the ``int_pairs`` function, anything else ``text_pairs``.

    Returns:
        The grouped SELECT wrapped as a CTE.
    """
    # text_pairs / int_pairs are not defined in this module; presumably they are
    # user-defined functions installed in the database - verify before deploying.
    pair_func = func.int_pairs if pair_type == Integer else func.text_pairs

    pair_column = pair_func(
        func.array_agg(pairable.distinct()),
        type_=ARRAY(pair_type),
    ).label("pair")

    new_pairs = select(groupable, pair_column).join(RecipeIngredientParts)

    # Compare against None explicitly: a SQLAlchemy selectable cannot be
    # truth-tested, and an empty collection is still a valid filter.
    if recipe_ids is not None:
        new_pairs = new_pairs.where(RecipeIngredient.recipe_id.in_(recipe_ids))

    return new_pairs.group_by(groupable).cte()
|
||||
|
||||
def pair_count_query(pairs, countable, recipe_ids=None):
    """Build a SELECT counting the distinct *countable* values for each pair.

    Args:
        pairs: array-typed pair column (e.g. the ``pair`` column of a
            ``pair_query`` CTE); selected and grouped by.
        countable: column whose distinct values are counted per pair.
        recipe_ids: optional collection or selectable; when given, only pairs
            whose element 0 or element 1 is in *recipe_ids* are kept.

    Returns:
        A SELECT of ``(pairs, count(distinct countable))`` grouped by *pairs*.
    """
    new_counts = select(pairs, func.count(func.distinct(countable)))

    # Explicit None check: a SQLAlchemy selectable cannot be truth-tested.
    if recipe_ids is not None:
        # NOTE(review): indexes 0 and 1 are passed to the database as written;
        # confirm they address both pair elements given the array's index base.
        new_counts = new_counts.where(
            or_(pairs[0].in_(recipe_ids), pairs[1].in_(recipe_ids))
        )

    return new_counts.group_by(pairs)
|
||||
|
||||
def update_graph_connectivity(session=None):
    """Fold recipes that are not yet graphed into the two connection tables.

    Inside a single transaction this:

    1. selects the ids of recipes whose ``RecipeGraphed.status`` is not True
       (including recipes with no ``RecipeGraphed`` row, via the outer join);
    2. adds their ingredient-pair counts to ``IngredientConnection``,
       incrementing existing rows and inserting missing ones;
    3. inserts ``RecipeConnection`` rows for recipe pairs involving them;
    4. marks the recipes as graphed, updating existing ``RecipeGraphed`` rows
       and inserting rows for recipes that had none.

    Args:
        session: session to use; when falsy a new ``Session`` bound to
            ``get_engine()`` is created.
    """
    # this is pure SQLAlchemy so it is more portable
    # This would have been simpler if I utilized Postgres specific feature
    if not session:
        session = Session(get_engine())

    with session.begin():
        # Recipes that still need to be added to the graphs.
        ids = (
            select(Recipe.id)
            .join(RecipeGraphed, isouter=True)
            .where(RecipeGraphed.status.is_not(True))
        )

        count_stmt = select(func.count('*')).select_from(ids.cte())
        num_recipes = session.execute(count_stmt).fetchone()[0]
        if num_recipes <= 0:
            logging.info("no new recipies")
            return

        logging.info(f"adding {num_recipes} recipes to the graphs")

        # --- ingredient connections -------------------------------------
        ingredient_pairs = pair_query(
            RecipeIngredientParts.ingredient,
            RecipeIngredient.recipe_id,
            recipe_ids=ids,
        )
        ingredient_counts = pair_count_query(
            ingredient_pairs.c.pair,
            ingredient_pairs.c.recipe_id,
        )

        logging.info("addeing new ingredient connections")
        for pair, count in session.execute(ingredient_counts):
            same_pair = and_(
                IngredientConnection.ingredient_a == pair[0],
                IngredientConnection.ingredient_b == pair[1],
            )
            existing = session.query(IngredientConnection).where(same_pair).first()
            if existing is None:
                session.add(
                    IngredientConnection(
                        ingredient_a=pair[0],
                        ingredient_b=pair[1],
                        recipe_count=count,
                    )
                )
            else:
                existing.recipe_count += count
                session.merge(existing)

        # --- recipe connections -----------------------------------------
        logging.info("adding new recipe connections")
        recipe_pairs = pair_query(
            RecipeIngredient.recipe_id,
            RecipeIngredientParts.ingredient,
            pair_type=Integer,
        )
        recipe_counts = pair_count_query(
            recipe_pairs.c.pair,
            recipe_pairs.c.ingredient,
            recipe_ids=ids,
        )

        for added, (pair, count) in enumerate(session.execute(recipe_counts), start=1):
            session.add(
                RecipeConnection(
                    recipe_a=pair[0],
                    recipe_b=pair[1],
                    ingredient_count=count,
                )
            )
            # flush often to reduce memory usage
            if added % 100000 == 0:
                session.flush()

        # --- RecipeGraphed bookkeeping ----------------------------------
        logging.info("updating existing RecipeGraphed rows")
        pending_rows = session.query(RecipeGraphed).where(
            RecipeGraphed.recipe_id.in_(ids)
        )
        for graphed_row in pending_rows:
            graphed_row.status = True
            session.merge(graphed_row)

        graphed = select(RecipeGraphed.recipe_id)

        # add recipies that aren't in the table
        logging.info("adding new RecipeGraphed rows")
        untracked = session.query(Recipe).where(
            and_(Recipe.id.in_(ids), not_(Recipe.id.in_(graphed)))
        )
        for recipe in untracked:
            session.add(RecipeGraphed(recipe_id=recipe.id, status=True))
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run as a script: connect using the environment settings and create the tables.
    engine = get_engine()
    create_tables(engine)
|
||||
@@ -1,29 +0,0 @@
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import db
|
||||
import json
|
||||
import argparse
|
||||
import logging
|
||||
|
||||
# Command-line script: read recipe-site definitions from a JSON file and insert
# one RecipeSite row per entry, all inside a single transaction.
parser = argparse.ArgumentParser(description='Import recipes into database')
parser.add_argument('file', type=str,
                    help='JSON file with recipe site information')
parser.add_argument('-v', '--verbose', action='store_true')

args = parser.parse_args()
if args.verbose:
    # Also surface the SQL emitted by SQLAlchemy.
    logging.basicConfig(level=logging.INFO)
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

# Read the file as UTF-8 explicitly instead of relying on the platform's
# default encoding, which would mis-decode non-ASCII site data on some systems.
with open(args.file, encoding="utf-8") as f:
    sites = json.load(f)

eng = db.get_engine()
S = sessionmaker(eng)

# S.begin() commits on success and rolls back if any insert raises.
with S.begin() as session:
    for site in sites:
        logging.info(f"Adding {site}")
        # Each entry's keys are passed as RecipeSite keyword arguments.
        session.add(db.RecipeSite(**site))
|
||||
|
||||
|
||||
190
src/scrape.py
190
src/scrape.py
@@ -1,190 +0,0 @@
|
||||
from ast import alias
|
||||
from dis import Instruction
|
||||
import db
|
||||
import re
|
||||
from sqlalchemy import select, desc, exists, not_, except_
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
import bs4
|
||||
from urllib.request import urlopen
|
||||
from urllib.parse import urljoin
|
||||
import logging
|
||||
from argparse import ArgumentParser
|
||||
|
||||
# Words recognised as the unit of an ingredient line.  Each entry is a regex
# fragment; an optional "s"/"es" plural suffix is added when the pattern is built.
_UNITS = ['teaspoon', 'tablespoon', 'gram', 'ounce', 'jar', 'cup', 'pinch',
          'container', 'slice', 'package', 'pound', 'can', 'dash', 'spear',
          'bunch', 'quart', 'cube', 'envelope', 'square', 'sprig', 'bag',
          'box', 'drop', 'fluid ounce', 'gallon', 'head', 'link', 'loaf',
          'pint', 'pod', 'sheet', 'stalk', 'whole', 'bar', 'bottle', 'bulb',
          'year', 'fillet', 'liter', 'packet', 'slices']

# Preparation words that may precede the ingredient name.  Each entry is a
# regex fragment; an optional "ly" suffix is added when the pattern is built.
_INSTRUCTIONS = ['and', 'or', 'chopped', 'diced', 'brewed', 'chilled',
                 'chunky', 'small', 'medium', 'large', 'coarse', 'cracked',
                 'crushed', 'ground', 'cooked', 'cubed', 'crumbled', 'cut',
                 'cold', 'hot', 'warm', 'day', 'old', 'drained', 'canned',
                 'dried', 'dry', 'fine', 'firm', 'fresh', 'frozen',
                 'grated', 'grilled', 'hard', 'juliened?', 'leftover',
                 'light', 'lite', 'mashed', 'melted', 'minced', 'packed',
                 'peeled', 'pitted', 'sliced', 'prepared', 'refrigerated',
                 'rehydrated', 'seedless', 'shaved', 'shredded', 'sifted',
                 'sieved', 'shucked', 'slivered', 'thick', 'thin',
                 'toasted', 'trimmed', 'unbaked', 'uncooked', 'unpeeled',
                 'unopened', 'unseasoned']


def _any_case_first_letter(words):
    """Join *words* into a regex alternation accepting either case of each first letter."""
    return "|".join(f'[{word[0]}{word[0].capitalize()}]{word[1:]}'
                    for word in words)


def _build_ingredient_regex():
    """Compile the five-group pattern used by parse_ingredient."""
    # Digits, '.', '/', vulgar-fraction characters, optional whitespace and
    # parenthesised notes, repeated.
    number_regex = r'((?:[\d\./\u00BC-\u00BE\u2150-\u215E]*\s?(?:\(.+\))?)*)'
    # The trailing \b stops a unit from matching the start of a longer word
    # ("can" in "candy", "bag" in "bagel").
    units_regex = rf"((?:(?:{_any_case_first_letter(_UNITS)})e?s?\b)?)"
    # Same for instruction words ("or" in "orange", "and" in "andouille").
    instructions_regex = (
        rf"((?:(?:(?:{_any_case_first_letter(_INSTRUCTIONS)})(?:ly)?\b)| )*)"
    )
    ingredient_regex = r"([a-zA-Z '\-]+)"
    supplement_regex = r",?(.*)"

    return re.compile(number_regex +
                      units_regex +
                      instructions_regex +
                      ingredient_regex +
                      supplement_regex)


# Compiled once at import time instead of on every call.
_INGREDIENT_REGEX = _build_ingredient_regex()


def parse_ingredient(ingredient_text):
    """Split one ingredient line into its components.

    Args:
        ingredient_text: the raw ingredient line, e.g.
            ``"1 cup chopped onion, divided"``.

    Returns:
        ``None`` when the text does not match; otherwise a five-item list
        ``[quantity, unit, instruction, ingredient, supplement]``.  Each item
        is stripped of surrounding whitespace; a component that matched
        nothing is ``None`` (one that matched only whitespace becomes ``""``).
    """
    m = _INGREDIENT_REGEX.match(ingredient_text)
    logging.info(f"Parsed {ingredient_text}, found: {m}")
    if not m:
        return None

    return [text.strip() if text else None for text in m.groups()]
|
||||
|
||||
|
||||
|
||||
def reparse_ingredients(session):
    """Parse every RecipeIngredient row that has no RecipeIngredientParts row yet.

    The missing ids are computed as the RecipeIngredient ids EXCEPT the
    RecipeIngredientParts ids.  Each such row's text is run through
    ``parse_ingredient``; rows that fail to parse are skipped, the rest get a
    new ``RecipeIngredientParts`` row added to *session* (not committed here).
    """
    missing_ids = except_(
        select(db.RecipeIngredient.id),
        select(db.RecipeIngredientParts.id),
    ).alias('missing')

    unparsed_rows = (
        session.query(db.RecipeIngredient)
        .where(db.RecipeIngredient.id.in_(missing_ids))
        .all()
    )

    for row in unparsed_rows:
        parsed = parse_ingredient(row.text)
        if not parsed:
            continue

        quantity, unit, instruction, name, supplement = parsed
        parts_row = db.RecipeIngredientParts(
            id=row.id,
            quantity=quantity,
            unit=unit,
            instruction=instruction,
            ingredient=name,
            supplement=supplement,
        )
        session.add(parts_row)
|
||||
|
||||
|
||||
|
||||
def load_recipe(recipe_url):
    """Download *recipe_url* and parse it with BeautifulSoup's ``html.parser``.

    Returns:
        The parsed page, or ``None`` when downloading, decoding or parsing
        raises; the failure is logged as a warning rather than propagated.
    """
    try:
        logging.info(f'Loading Recipe: {recipe_url}')
        with urlopen(recipe_url) as response:
            status = response.getcode()
            if status == 404:
                raise Exception(f"Recipe Does not exist: {recipe_url}")
            # decode() with no argument uses UTF-8.
            html = response.read().decode()
        return bs4.BeautifulSoup(html, 'html.parser')
    except Exception as err:
        # Best effort: report the problem and let the caller skip this recipe.
        logging.warning(f"Could not download or parse recipe: {recipe_url}")
        logging.warning(err)

    return None
|
||||
|
||||
def parse_recipe(session, recipe, site):
    """Scrape one recipe page and stage the recipe and its ingredients in *session*.

    The page URL is ``site.base_url`` joined with ``recipe.identifier``.  The
    recipe name is the text of the first element with class
    ``site.name_class``; every element with class ``site.ingredient_class``
    becomes a ``RecipeIngredient`` row, plus a ``RecipeIngredientParts`` row
    when its text can be parsed.

    Args:
        session: session the new rows are added to and flushed on.
        recipe: ``db.Recipe`` to fill in; its ``name`` is set here.
        site: ``db.RecipeSite`` the recipe belongs to.

    Returns:
        *recipe*, or ``None`` when the page could not be loaded.

    Raises:
        Exception: when no element with ``site.name_class`` is found.
    """
    recipe_url = urljoin(site.base_url, str(recipe.identifier))
    page = load_recipe(recipe_url)
    if not page:
        return None

    name_nodes = page.find_all(class_=site.name_class)
    if not name_nodes:
        raise Exception(f"Could not extract recipe name: {recipe_url}")
    recipe.name = name_nodes[0].text

    logging.info(f"Adding Recipe {recipe.name} from {recipe_url}")

    # Flush so recipe.id is populated before ingredient rows reference it.
    session.add(recipe)
    session.flush()

    ingredient_nodes = page.find_all(class_=site.ingredient_class)
    for node in ingredient_nodes:
        line = db.RecipeIngredient(text=node.text, recipe_id=recipe.id)
        session.add(line)
        # Flush so line.id is populated before the parts row reuses it.
        session.flush()

        parsed = parse_ingredient(line.text)
        if not parsed:
            continue

        quantity, unit, instruction, ingredient, supplement = parsed
        session.add(
            db.RecipeIngredientParts(
                id=line.id,
                quantity=quantity,
                unit=unit,
                instruction=instruction,
                ingredient=ingredient,
                supplement=supplement,
            )
        )

    logging.info(f"{len(ingred_candidates := ingredient_nodes)} ingredients found. Inserting into DB")

    return recipe
|
||||
|
||||
|
||||
# Command-line script: scrape recipes from one configured site.  Either scrape
# the identifier(s) given with -id, or with -a N scrape N consecutive numeric
# identifiers starting at -id (or just after the last stored identifier).
parser = ArgumentParser(description="Scrape a recipe site for recipies")
parser.add_argument('site',
                    help='Name of site')
parser.add_argument('-id', '--identifier', dest='id',
                    help='url of recipe(reletive to base url of site) or commma seperated list')
parser.add_argument('-a', '--auto', action='store', dest='n',
                    help='automaticaly generate identifier(must supply number of recipies to scrape)')
parser.add_argument('-v', '--verbose', action='store_true')

args = parser.parse_args()
if args.verbose:
    # Also surface the SQL emitted by SQLAlchemy.
    logging.basicConfig(level=logging.INFO)
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

eng = db.get_engine()
S = sessionmaker(eng)

# One outer transaction for the whole run; each recipe gets its own savepoint
# so a failing recipe is rolled back without losing the others.
with S.begin() as sess:
    # .one() raises unless exactly one site has the requested name.
    site = sess.query(db.RecipeSite).where(db.RecipeSite.name == args.site).one()
    site_id = site.id

    recipe_ids = []
    starting_id = 0
    if args.id and not args.n:
        # The help text allows a comma separated list, so split it; a single
        # identifier still yields a one-element list.
        recipe_ids.extend(part.strip()
                          for part in args.id.split(',')
                          if part.strip())
        logging.info(f'Retreiving single recipe: {args.id}')
    elif args.n:
        if not args.id:
            # NOTE(review): identifier is a String column, so this ordering is
            # textual ("99" sorts after "100") - confirm it picks the intended row.
            last_recipe = sess.query(db.Recipe).\
                where(db.Recipe.recipe_site_id == site.id).\
                order_by(desc(db.Recipe.identifier)).\
                limit(1).\
                scalar()
            if last_recipe is None:
                # Nothing stored yet: there is no identifier to continue from.
                parser.error(f"no recipes stored for site {args.site!r}; "
                             "pass -id to choose the starting identifier")
            starting_id = int(last_recipe.identifier) + 1
        else:
            starting_id = int(args.id)
        recipe_ids = range(starting_id, starting_id + int(args.n))
        logging.info(f'Retreving {args.n} recipes from {site.base_url} starting at {starting_id}')

    for recipe_id in recipe_ids:
        # Open the savepoint before the try so the handlers below never see
        # an unbound `savepoint`.
        savepoint = sess.begin_nested()
        try:
            recipe = db.Recipe(identifier=recipe_id, recipe_site_id=site.id)
            parse_recipe(sess, recipe, site)

            savepoint.commit()
        except KeyboardInterrupt:
            # Stop on Ctrl-C, discarding only the recipe in progress.
            savepoint.rollback()
            break
        except Exception as e:
            # Log the failure and carry on with the next identifier.
            savepoint.rollback()
            logging.error(e)
            continue
|
||||
|
||||
|
||||
Reference in New Issue
Block a user