import logging
import re
from argparse import ArgumentParser
from urllib.parse import urljoin
from urllib.request import urlopen

import bs4
from sqlalchemy import desc, except_, select
from sqlalchemy.orm import sessionmaker

import db


def parse_ingredient(ingredient_text):
    units = ['teaspoon', 'tablespoon', 'gram', 'ounce', 'jar', 'cup', 'pinch',
             'container', 'slice', 'package', 'pound', 'can', 'dash', 'spear',
             'bunch', 'quart', 'cube', 'envelope', 'square', 'sprig']

    # Quantity: digits, ".", "/", and unicode vulgar fractions, optionally
    # followed by a parenthesized clarification such as "(8 ounce)".
    number_regex = r'((?:[\d\./\u00BC-\u00BE\u2150-\u215E]*\s?(?:\(.+\))?)*)'
    ingredient_regex = r"([a-zA-Z '\-]+)"
    supplement_regex = ',?(.*)'
    # Accept each unit with either capitalization of its first letter, plus an
    # optional plural suffix ("cup", "Cups", "dashes", ...).
    units_regex = '|'.join(f'[{unit[0]}{unit[0].upper()}]{unit[1:]}'
                           for unit in units)
    units_regex = f'((?:(?:{units_regex})e?s?)?)'

    regex = re.compile(number_regex +
                       units_regex +
                       ingredient_regex +
                       supplement_regex)

    m = regex.match(ingredient_text)
    logging.info(f"Parsed {ingredient_text}, found: {m}")
    if not m:
        return None

    return [text.strip() for text in m.groups()]

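# Illustrative behavior of the pattern above (example mine, not from the
# original source): a typical line splits into quantity, unit, name, and
# supplement, e.g.
#   parse_ingredient("2 cups flour, sifted")
#   -> ['2', 'cups', 'flour', 'sifted']

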
def reparse_ingredients(session):
    # ids of ingredients that do not yet have a parsed-parts row
    missing_ids = except_(select(db.RecipeIngredient.id),
                          select(db.RecipeIngredientParts.id)).alias('missing')
    missing = session.query(db.RecipeIngredient).\
        where(db.RecipeIngredient.id.in_(missing_ids)).all()

    for ingredient in missing:
        parts = parse_ingredient(ingredient.text)
        if not parts:
            continue
        quantity, unit, name, supplement = parts
        session.add(db.RecipeIngredientParts(id=ingredient.id,
                                             quantity=quantity,
                                             unit=unit,
                                             ingredient=name,
                                             supplement=supplement))

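# For reference, the EXCEPT above compiles to roughly this SQL (the table
# names are assumptions based on the model class names, so treat this as a
# sketch):
#   SELECT recipe_ingredient.id FROM recipe_ingredient
#   EXCEPT
#   SELECT recipe_ingredient_parts.id FROM recipe_ingredient_parts

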
def load_recipe(recipe_url):
    try:
        logging.info(f'Loading Recipe: {recipe_url}')
        with urlopen(recipe_url) as f:
            # urlopen usually raises HTTPError for a 404 before this check is
            # reached; that path is caught by the except clause below.
            if f.getcode() == 404:
                raise Exception(f"Recipe does not exist: {recipe_url}")
            return bs4.BeautifulSoup(f.read().decode(), 'html.parser')
    except Exception as e:
        logging.warning(f"Could not download or parse recipe: {recipe_url}")
        logging.warning(e)

    return None

def parse_recipe(session, recipe, site):
    recipe_url = urljoin(site.base_url, str(recipe.identifier))
    recipe_page = load_recipe(recipe_url)
    if not recipe_page:
        return None

    name_candidates = recipe_page.find_all(class_=site.name_class)
    if len(name_candidates) == 0:
        raise Exception(f"Could not extract recipe name: {recipe_url}")
    name_div = name_candidates[0]
    recipe.name = name_div.text

    logging.info(f"Adding Recipe {recipe}")
    session.add(recipe)
    session.flush()

    ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
    logging.info(f"{len(ingred_candidates)} ingredients found. Inserting into DB")
    for candidate in ingred_candidates:
        ingred = db.RecipeIngredient(text=candidate.text,
                                     recipe_id=recipe.id)
        session.add(ingred)
        # Flush so the ingredient row gets its primary key before the parts
        # row that shares that id is created.
        session.flush()

        parts = parse_ingredient(ingred.text)
        if parts:
            quantity, unit, ingredient, supplement = parts
            ingred_parts = db.RecipeIngredientParts(id=ingred.id,
                                                    quantity=quantity,
                                                    unit=unit,
                                                    ingredient=ingredient,
                                                    supplement=supplement)
            session.add(ingred_parts)

    return recipe

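# parse_recipe() above reads its per-site scraping hints from the RecipeSite
# row. A row might look like this (values are hypothetical, for illustration
# only):
#   db.RecipeSite(name='example', base_url='https://example.com/recipe/',
#                 name_class='recipe-title', ingredient_class='ingredients-item')

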
parser = ArgumentParser(description="Scrape a recipe site for recipes")
parser.add_argument('site',
                    help='Name of site')
parser.add_argument('-id', '--identifier', dest='id',
                    help='url of recipe (relative to base url of site) '
                         'or comma-separated list')
parser.add_argument('-a', '--auto', action='store', dest='n',
                    help='automatically generate identifiers (must supply '
                         'number of recipes to scrape)')
parser.add_argument('-v', '--verbose', action='store_true')

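# Example invocations (the script name "scrape.py" and site name "example"
# are hypothetical):
#   python scrape.py example -id 24768 -v    # one recipe by identifier
#   python scrape.py example -a 50           # next 50 auto-generated ids
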
args = parser.parse_args()
if args.verbose:
    logging.basicConfig(level=logging.INFO)
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

eng = db.get_engine()
S = sessionmaker(eng)

with S.begin() as sess:
    site = sess.query(db.RecipeSite).where(db.RecipeSite.name == args.site).one()

    recipe_ids = []
    starting_id = 0
    if args.id and not args.n:
        recipe_ids.append(args.id)
        logging.info(f'Retrieving single recipe: {args.id}')
    elif args.n:
        if not args.id:
            # Continue numbering after the highest identifier already stored
            # for this site (assumes at least one recipe exists).
            last_recipe = sess.query(db.Recipe).\
                where(db.Recipe.recipe_site_id == site.id).\
                order_by(desc(db.Recipe.identifier)).\
                limit(1).\
                scalar()
            starting_id = int(last_recipe.identifier) + 1
        else:
            starting_id = int(args.id)
        recipe_ids = range(starting_id, starting_id + int(args.n))
        logging.info(f'Retrieving {args.n} recipes from {site.base_url} '
                     f'starting at {starting_id}')

    for recipe_id in recipe_ids:
        try:
            # Savepoint so a single failed recipe can be rolled back without
            # discarding recipes already added in this session.
            savepoint = sess.begin_nested()
            recipe = db.Recipe(identifier=recipe_id, recipe_site_id=site.id)
            parse_recipe(sess, recipe, site)
            savepoint.commit()
        except KeyboardInterrupt:
            savepoint.rollback()
            break
        except Exception as e:
            savepoint.rollback()
            logging.error(e)
            # Stop at the first unexpected error rather than skipping ahead.
            break
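# reparse_ingredients() is defined above but not wired to a command-line flag;
# a backfill run could look like this sketch (my assumption, not part of the
# original flow):
#   with S.begin() as sess:
#       reparse_ingredients(sess)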