Compare commits

..

60 Commits

Author SHA1 Message Date
d4750c97f5 drone-ci testing
All checks were successful
continuous-integration/drone/pr Build is passing
2023-05-15 23:16:00 -04:00
7d57c91627 drone-ci testing
Some checks failed
continuous-integration/drone/pr Build is failing
2023-05-15 23:12:55 -04:00
0d6d761d9b drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 23:12:09 -04:00
201fb26e3f drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 23:04:56 -04:00
22640970de drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 23:02:54 -04:00
95df62b5fb drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 23:01:21 -04:00
44afed516c drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 22:57:39 -04:00
79f0e88e7d drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 22:54:59 -04:00
4a51622c7e drone-ci testing
Some checks failed
continuous-integration/drone/push Build encountered an error
2023-05-15 22:54:19 -04:00
c2aab348ca drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 22:51:19 -04:00
3f16f3a964 drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 22:48:56 -04:00
13250151ea drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 22:45:30 -04:00
97b7c2b8ea drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 22:42:14 -04:00
86cb243ee6 drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 22:42:02 -04:00
8b0e1800da drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 22:33:48 -04:00
eb607b99d9 drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 22:33:10 -04:00
7886ada3ef drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 22:32:06 -04:00
1bc45bcd32 drone-ci testing
Some checks failed
continuous-integration/drone/push Build encountered an error
2023-05-15 22:30:49 -04:00
f88e61b463 drone-ci testing
Some checks failed
continuous-integration/drone/push Build encountered an error
2023-05-15 22:30:06 -04:00
c4bec20781 drone-ci testing
Some checks failed
continuous-integration/drone/push Build encountered an error
2023-05-15 22:24:37 -04:00
6115c165bd drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 22:23:08 -04:00
0d975b1479 drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 22:21:56 -04:00
40f3ad24b9 drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 22:19:55 -04:00
2801cef4be drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 22:18:11 -04:00
01630d2d5f drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 22:17:05 -04:00
b366a81716 drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 22:16:14 -04:00
19d60c9d86 drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 22:12:35 -04:00
20111f0e13 drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 22:09:59 -04:00
477dfc9648 drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 22:07:36 -04:00
d3db3ce673 drone-ci testing
Some checks failed
continuous-integration/drone/push Build encountered an error
2023-05-15 22:06:56 -04:00
a59a4580d1 drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 22:05:00 -04:00
21f8696bac drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 21:57:35 -04:00
aeb7386180 drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 21:56:35 -04:00
f7dc8100ee drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 21:55:38 -04:00
caebe6f284 drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 21:54:51 -04:00
6530abaa56 drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 21:53:02 -04:00
4dcd9aeada drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 21:52:27 -04:00
fff856afdc drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 18:09:12 -04:00
f2c76945c4 drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 18:07:54 -04:00
40353e8c85 drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 17:55:04 -04:00
d0a706433d drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 17:44:10 -04:00
285533ea61 drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 12:47:07 -04:00
0be4294782 drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 12:44:30 -04:00
e0aa886209 drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 12:44:10 -04:00
100cf89a01 drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 12:43:33 -04:00
88decd67e6 drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 12:41:06 -04:00
219ad68e2d drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 12:38:29 -04:00
b6b1794252 drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 12:35:09 -04:00
6f513001d3 drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 12:15:25 -04:00
8c127b806d drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 12:11:18 -04:00
c66103223c drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 12:05:53 -04:00
accdd9449d drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 12:05:28 -04:00
779e55db74 drone-ci testing
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-15 12:04:30 -04:00
6331cb8bdf drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 12:03:03 -04:00
1364896616 drone-ci testing
Some checks failed
continuous-integration/drone/push Build is failing
2023-05-15 12:02:16 -04:00
d46909eaa9 drone-ci testing
Some checks failed
continuous-integration/drone/push Build encountered an error
2023-05-15 11:32:05 -04:00
08dc04f32d drone-ci testing
Some checks failed
continuous-integration/drone/push Build encountered an error
2023-05-15 11:30:25 -04:00
615f72b9cc drone-ci testing
Some checks failed
continuous-integration/drone/push Build encountered an error
2023-05-15 11:28:09 -04:00
2d59b2f2ee drone-ci testing
Some checks failed
continuous-integration/drone/push Build encountered an error
2023-05-15 11:27:02 -04:00
01b8a2be62 drone-ci testing
Some checks failed
continuous-integration/drone/push Build encountered an error
2023-05-15 11:25:26 -04:00
9 changed files with 149 additions and 357 deletions

View File

@@ -6,7 +6,7 @@ environment:
trigger: trigger:
event: event:
include: include:
- push - pull_request
steps: steps:
- name: db-up - name: db-up
@@ -51,7 +51,6 @@ steps:
- export POSTGRES_URL=$hostip - export POSTGRES_URL=$hostip
- . .venv/bin/activate - . .venv/bin/activate
- pytest - pytest
- name: db-cleanup - name: db-cleanup
image: docker/compose:alpine-1.29.2 image: docker/compose:alpine-1.29.2
volumes: volumes:

View File

@@ -88,13 +88,9 @@ docker-compose -p recipe-test up
running tests running tests
```sh ```sh
pytest --cov=src/recipe_graph --cov-report lcov --cov-report html pytest
``` ```
The html report is under `htmlcov/` and can be viewed through any browser.
The `lcov` file can be used for the [Coverage Gutters](https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters)
plugin for VS Code to view coverage in your editor.
**WARNING**: If you get `ERROR at setup of test_db_connection` and **WARNING**: If you get `ERROR at setup of test_db_connection` and
`ERROR at setup of test_db_class_creation`, please check if testing database is `ERROR at setup of test_db_class_creation`, please check if testing database is
already initiated. Testing is destructive and should be done on a fresh database. already initiated. Testing is destructive and should be done on a fresh database.
@@ -106,8 +102,18 @@ docker-compose -p recipe-test down
``` ```
Tests are written in the pytest framework. Currently focused on unit tests and code Tests are written in the pytest framework. Currently focused on unit tests.
coverage. Integration tests to come. Integration tests to come.
To run the tests with a coverage report, use:
```
pytest --cov=src/recipe_graph --cov-report lcov --cov-report html
```
The html report is under `htmlcov/` and can be viewed through any browser.
The `lcov` file can be used for the [Coverage Gutters](https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters)
plugin for VS Code to view coverage in your editor.
## TODO ## TODO
> ☑ automate scraping\ > ☑ automate scraping\

View File

@@ -2,14 +2,5 @@
requires = ["setuptools>=61.0"] requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"
[project] [metadata]
name = "recipe_graph" name = "recepie_graph"
version = "0.0.1"
description = "mapping out recipes relations"
dependencies = [
"SQLAlchemy==1.4.39",
"python-dotenv==0.20.0",
"beautifulsoup4==4.11.1",
"psycopg2-binary==2.9.3",
"requests~=2.30.0"
]

View File

@@ -12,7 +12,6 @@ pyparsing==3.0.9
pytest==7.1.3 pytest==7.1.3
pytest-cov==4.0.0 pytest-cov==4.0.0
python-dotenv==0.20.0 python-dotenv==0.20.0
requests~=2.30.0
soupsieve==2.3.2.post1 soupsieve==2.3.2.post1
SQLAlchemy==1.4.39 SQLAlchemy==1.4.39
tomli==2.0.1 tomli==2.0.1

View File

@@ -1,5 +1,5 @@
from pydoc import apropos from pydoc import apropos
from sqlalchemy.orm import Session from sqlalchemy.orm import sessionmaker
from recipe_graph import db from recipe_graph import db
import json import json
import argparse import argparse
@@ -7,7 +7,7 @@ import logging
import sys import sys
def load_file(f_name: str) -> list[dict[str, any]]: def load_file(f_name: str):
with open(f_name) as f: with open(f_name) as f:
sites = json.load(f) sites = json.load(f)
return sites return sites
@@ -29,18 +29,6 @@ def setup_logging(args: argparse.Namespace) -> logging.Logger:
return logger return logger
def add_sites(
S: Session,
sites: list[dict[str, any]],
logger: logging.Logger = None,
):
with S.begin() as session:
for site in sites:
if logger: # pragma: no cover
logger.info(f"Adding {site}")
session.add(db.RecipeSite(**site))
def main(): # pragma: no cover def main(): # pragma: no cover
args = setup_argparser(sys.argv[1:]) args = setup_argparser(sys.argv[1:])
logger = setup_logging(args) logger = setup_logging(args)
@@ -48,7 +36,10 @@ def main(): # pragma: no cover
S = db.get_session() S = db.get_session()
sites = load_file(args.file) sites = load_file(args.file)
add_sites(S, sites, logger) with S.begin() as session:
for site in sites:
logger.info(f"Adding {site}")
session.add(db.RecipeSite(**site))
if __name__ == "__main__": # pragma: no cover if __name__ == "__main__": # pragma: no cover

View File

@@ -4,58 +4,46 @@ import re
from sqlalchemy import select, desc, exists, not_, except_ from sqlalchemy import select, desc, exists, not_, except_
from sqlalchemy.orm import sessionmaker from sqlalchemy.orm import sessionmaker
import bs4 import bs4
import requests as req from urllib.request import urlopen
from urllib.parse import urljoin from urllib.parse import urljoin
import logging import logging
from argparse import ArgumentParser from argparse import ArgumentParser
def parse_ingredient(ingredient_text):
def ingredient_regex(units: list[str], instructions: list[str]) -> re.Pattern: units = ['teaspoon', 'tablespoon', 'gram', 'ounce', 'jar', 'cup', 'pinch',
number_regex = "((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)" 'container', 'slice', 'package', 'pound', 'can', 'dash', 'spear',
ingredient_regex = "([a-zA-Z '\-]+)" 'bunch', 'quart', 'cube', 'envelope', 'square', 'sprig', 'bag',
supplement_regex = ",?(.*)" 'box', 'drop', 'fluid ounce', 'gallon', 'head', 'link', 'loaf',
units_regex = "|".join( 'pint', 'pod', 'sheet', 'stalk', 'whole', 'bar', 'bottle', 'bulb',
[f"[{unit[0]}{unit[0].capitalize()}]{unit[1:]}" for unit in units] 'year', 'fillet', 'litter', 'packet', 'slices']
) instructions = ['and', 'or', 'chopped', 'diced', 'brewed', 'chilled',
'chunky', 'small', 'medium', 'large', 'couarse', 'cracked',
'crushed', 'ground', 'cooked', 'cubed', 'crumbled', 'cut',
'cold', 'hot', 'warm', 'day', 'old', 'drained', 'canned',
'dried', 'dry', 'fine', 'firm', 'fresh', 'frozen',
'grated', 'grilled', 'hard', 'hot', 'juliened?', 'leftover',
'light', 'lite', 'mashed', 'melted', 'minced', 'packed',
'peeled', 'pitted', 'sliced', 'prepared', 'refrigerated',
'rehydrated', 'seedless', 'shaved', 'shredded', 'sifted',
'sieved', 'shucked', 'slivered', 'thick', 'sliced', 'thin',
'toasted', 'trimmed', 'unbaked', 'uncooked', 'unpeeled',
'unopened', 'unseasoned']
number_regex = '((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)'
ingredient_regex = '([a-zA-Z \'\-]+)'
supplement_regex = ',?(.*)'
units_regex = "|".join([f'[{unit[0]}{unit[0].capitalize()}]{unit[1:]}'
for unit in units])
units_regex = f"((?:(?:{units_regex})e?s?)?)" units_regex = f"((?:(?:{units_regex})e?s?)?)"
instructions_regex = "|".join( instructions_regex = "|".join([f'[{inst[0]}{inst[0].capitalize()}]{inst[1:]}'
[f"[{inst[0]}{inst[0].capitalize()}]{inst[1:]}" for inst in instructions] for inst in instructions])
)
instructions_regex = f"((?:(?:(?:{instructions_regex})(?:ly)?)| )*)" instructions_regex = f"((?:(?:(?:{instructions_regex})(?:ly)?)| )*)"
return re.compile( regex = re.compile(number_regex +
number_regex units_regex +
+ units_regex instructions_regex +
+ instructions_regex ingredient_regex +
+ ingredient_regex supplement_regex)
+ supplement_regex
)
# TODO: load units and instructions from config.
# Moved data into optional parameters for the time being.
def parse_ingredient(
ingredient_text: str,
units: list[str] = [ "teaspoon", "tablespoon", "gram", "ounce", "jar",
"cup", "pinch", "container", "slice", "package",
"pound", "can", "dash", "spear", "bunch", "quart",
"cube", "envelope", "square", "sprig", "bag", "box",
"drop", "fluid ounce", "gallon", "head", "link",
"loaf", "pint", "pod", "sheet", "stalk", "whole",
"bar", "bottle", "bulb", "year", "fillet", "litter",
"packet", "slices"],
instructions: list[str] = [
"and", "or", "chopped", "diced", "brewed", "chilled", "chunky", "small",
"medium", "large", "couarse", "cracked", "crushed", "ground", "cooked",
"cubed", "crumbled", "cut", "cold", "hot", "warm", "day", "old",
"drained", "canned", "dried", "dry", "fine", "firm", "fresh", "frozen",
"grated", "grilled", "hard", "hot", "juliened?", "leftover", "light",
"lite", "mashed", "melted", "minced", "packed", "peeled", "pitted",
"sliced", "prepared", "refrigerated", "rehydrated", "seedless", "shaved",
"shredded", "sifted", "sieved", "shucked", "slivered", "thick", "sliced",
"thin", "toasted", "trimmed", "unbaked", "uncooked", "unpeeled",
"unopened", "unseasoned"],
):
regex = ingredient_regex(units, instructions)
m = regex.match(ingredient_text) m = regex.match(ingredient_text)
logging.info(f"Parsed {ingredient_text}, found: {m}") logging.info(f"Parsed {ingredient_text}, found: {m}")
@@ -64,120 +52,95 @@ def parse_ingredient(
return [text.strip() if text else None for text in m.groups()] return [text.strip() if text else None for text in m.groups()]
# this code is unused
# TODO: add tests when this is used
def missing_ingredients_query(session):
cte = (
except_(select(db.RecipeIngredient.id), select(db.RecipeIngredientParts.id))
).alias("missing")
missing = (
session.query(db.RecipeIngredient).where(db.RecipeIngredient.id.in_(cte)).all()
)
return missing
# this code is unused
# TODO: add tests when this is used def reparse_ingredients(session):
def parse_missing_ingredients(session): cte = (except_(select(db.RecipeIngredient.id),
missing = missing_ingredients_query(session) select(db.RecipeIngredientParts.id))).\
alias('missing')
missing = session.query(db.RecipeIngredient).where(db.RecipeIngredient.id.in_(cte)).all()
for ingredient in missing: for ingredient in missing:
parts = ingredient_to_parts(ingredient) parts = parse_ingredient(ingredient.text)
session.add(parts) if not parts:
continue
quantity, unit, instruction, name, supplement = parts
session.add(db.RecipeIngredientParts(id = ingredient.id,
quantity = quantity,
unit = unit,
instruction = instruction,
ingredient = name,
supplement = supplement))
def load_page(recipe_url: str) -> bs4.BeautifulSoup:
def load_recipe(recipe_url):
try: try:
logging.info(f"Loading Page: {recipe_url}") logging.info(f'Loading Recipe: {recipe_url}')
with req.get(recipe_url) as resp: with urlopen(recipe_url) as f:
if resp.status_code == 404: if f.getcode() == 404:
raise Exception(f"Page does not exist (404): {recipe_url}") raise Exception(f"Recipe Does not exist: {recipe_url}")
return bs4.BeautifulSoup(resp.text, "html.parser") return bs4.BeautifulSoup(f.read().decode(), 'html.parser')
except Exception as e: except Exception as e:
logging.warning(f"Could not download or parse recipe: {recipe_url}") logging.warning(f"Could not download or parse recipe: {recipe_url}")
logging.warning(e) logging.warning(e)
return None
def parse_recipe_name(
site: db.RecipeSite,
page: bs4.BeautifulSoup,
recipe: db.Recipe,
url: str = None,
) -> db.Recipe:
if not url:
url = {"site": site.base_url, "recipe": recipe.identifier}
name_candidates = page.find_all(class_=site.name_class)
if len(name_candidates) == 0:
raise Exception(f"Could not extract recipe name: {url}")
name_div = name_candidates[0]
recipe.name = name_div.text
logging.info(f"Adding Recipe {recipe.name} from {url}")
return recipe
def ingredient_to_parts(
ingredient: db.Ingredient
) -> db.RecipeIngredientParts:
parts = parse_ingredient(ingredient.text)
if parts:
quantity, unit, instruction, ingredient_name, supplement = parts
return db.RecipeIngredientParts(
id=ingredient.id,
quantity=quantity,
unit=unit,
instruction=instruction,
ingredient=ingredient_name,
supplement=supplement,
)
def parse_recipe(session, recipe, site): def parse_recipe(session, recipe, site):
recipe_url = urljoin(site.base_url, str(recipe.identifier)) recipe_url = urljoin(site.base_url, str(recipe.identifier))
recipe_page = load_page(recipe_url) recipe_page = load_recipe(recipe_url)
if not recipe_page: if not recipe_page:
return None return None
recipe = parse_recipe_name(site, recipe_page, recipe, recipe_url) name_candidates = recipe_page.find_all(class_=site.name_class)
if len(name_candidates) == 0:
raise Exception(f"Could not extract recipe name: {recipe_url}")
name_div = name_candidates[0]
recipe.name = name_div.text
logging.info(f"Adding Recipe {recipe.name} from {recipe_url}")
session.add(recipe) session.add(recipe)
session.flush() session.flush()
candidates = recipe_page.find_all(class_=site.ingredient_class) ingred_candidates = recipe_page.find_all(class_=site.ingredient_class)
for candidate in candidates: for candidate in ingred_candidates:
ingredient = db.RecipeIngredient(text=candidate.text, recipe_id=recipe.id) ingred = db.RecipeIngredient(text=candidate.text,
session.add(ingredient) recipe_id=recipe.id)
session.add(ingred)
session.flush() session.flush()
parts = ingredient_to_parts(ingredient) parts = parse_ingredient(ingred.text)
if parts: if parts:
session.add(parts) quantity, unit, instruction,ingredient, supplement = parts
ingred_parts = db.RecipeIngredientParts(id = ingred.id,
quantity = quantity,
unit = unit,
instruction = instruction,
ingredient = ingredient,
supplement = supplement)
session.add(ingred_parts)
logging.info(f"{len(candidates)} ingredients found. Inserting into DB") logging.info(f"{len(ingred_candidates)} ingredients found. Inserting into DB")
return recipe return recipe
def main():
def main(): # pragma: no cover
parser = ArgumentParser(description="Scrape a recipe site for recipies") parser = ArgumentParser(description="Scrape a recipe site for recipies")
parser.add_argument("site", help="Name of site") parser.add_argument('site',
parser.add_argument( help='Name of site')
"-id", parser.add_argument('-id', '--identifier', dest='id',
"--identifier", help='url of recipe(reletive to base url of site) or commma seperated list')
dest="id", parser.add_argument('-a', '--auto', action='store', dest='n',
help="url of recipe(reletive to base url of site) or commma seperated list", help='automaticaly generate identifier(must supply number of recipies to scrape)')
) parser.add_argument('-v', '--verbose', action='store_true')
parser.add_argument(
"-a",
"--auto",
action="store",
dest="n",
help="automaticaly generate identifier(must supply number of recipies to scrape)",
)
parser.add_argument("-v", "--verbose", action="store_true")
args = parser.parse_args(sys.argv) args = parser.parse_args(sys.argv)
if args.verbose: if args.verbose:
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO) logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
eng = db.get_engine() eng = db.get_engine()
S = sessionmaker(eng) S = sessionmaker(eng)
@@ -190,29 +153,27 @@ def main(): # pragma: no cover
starting_id = 0 starting_id = 0
if args.id and not args.n: if args.id and not args.n:
recipe_ids.append(args.id) recipe_ids.append(args.id)
logging.info(f"Retreiving single recipe: {args.id}") logging.info(f'Retreiving single recipe: {args.id}')
elif args.n: elif args.n:
if not args.id: if not args.id:
last_recipe = ( last_recipe = sess.query(db.Recipe).\
sess.query(db.Recipe) where(db.Recipe.recipe_site_id == site.id).\
.where(db.Recipe.recipe_site_id == site.id) order_by(desc(db.Recipe.identifier)).\
.order_by(desc(db.Recipe.identifier)) limit(1).\
.limit(1) scalar()
.scalar()
)
starting_id = int(last_recipe.identifier) + 1 starting_id = int(last_recipe.identifier) + 1
else: else:
starting_id = int(args.id) starting_id = int(args.id)
recipe_ids = range(starting_id, starting_id + int(args.n)) recipe_ids = range(starting_id, starting_id+int(args.n))
logging.info( logging.info(f'Retreving {args.n} recipes from {site.base_url} starting at {starting_id}')
f"Retreving {args.n} recipes from {site.base_url} starting at {starting_id}"
)
for recipe_id in recipe_ids: for recipe_id in recipe_ids:
try: try:
savepoint = sess.begin_nested() savepoint = sess.begin_nested()
recipe = db.Recipe(identifier=recipe_id, recipe_site_id=site.id) recipe = db.Recipe(identifier = recipe_id, recipe_site_id = site.id)
parse_recipe(sess, recipe, site) parse_recipe(sess, recipe, site)
savepoint.commit() savepoint.commit()

View File

@@ -8,7 +8,7 @@ import sqlalchemy
import pytest import pytest
@pytest.fixture() @pytest.fixture
def engine() -> sqlalchemy.engine.Engine: def engine() -> sqlalchemy.engine.Engine:
engine = db.get_engine() engine = db.get_engine()
# make sure db is empty otherwise might be testing a live db # make sure db is empty otherwise might be testing a live db

View File

@@ -1,50 +1,26 @@
import json import json
import os import os
from recipe_graph import insert_sites, db from recipe_graph import insert_sites
from sqlalchemy import select from sqlalchemy import select
import sqlalchemy
import logging import logging
import pytest import pytest
from test_db import engine, init_db
@pytest.fixture @pytest.fixture
def json_data() -> list[dict[str, any]]: def json_data() -> list[dict]:
return [{"key": "value"}, {"test": "value1", "test2": "value2"}] return [{"key": "value"}, {"test": "value1", "test2": "value2"}]
@pytest.fixture
def db_initialized(engine) -> sqlalchemy.engine.Engine:
init_db(engine)
return engine
@pytest.fixture
def mock_sites() -> list[dict[str, any]]:
return [
{
"name": "example-site",
"ingredient_class": "example-item-name",
"name_class": "example-content",
"base_url": "https://www.example.com/recipe/",
},
{
"name": "test-site",
"ingredient_class": "test-item-name",
"name_class": "test-content",
"base_url": "https://www.test.com/recipe/",
},
]
@pytest.fixture @pytest.fixture
def json_file(json_data: list[dict]) -> str: def json_file(json_data: list[dict]) -> str:
f_path = "test.json" f_path = "test.json"
with open(f_path, "w") as f: with open(f_path, 'w') as f:
json.dump(json_data, f) json.dump(json_data, f)
yield f_path yield f_path
if os.path.exists(f_path): if os.path.exists(f_path):
os.remove(f_path) os.remove(f_path)
def test_load_file(json_file: str, json_data): def test_load_file(json_file: str, json_data):
test_data = insert_sites.load_file(json_file) test_data = insert_sites.load_file(json_file)
assert test_data == json_data assert test_data == json_data
@@ -57,6 +33,7 @@ def test_setup_argparser():
assert args.file == file_name assert args.file == file_name
assert args.verbose == False assert args.verbose == False
args = insert_sites.setup_argparser([file_name, "-v"]) args = insert_sites.setup_argparser([file_name, "-v"])
assert args.file == file_name assert args.file == file_name
assert args.verbose == True assert args.verbose == True
@@ -65,7 +42,6 @@ def test_setup_argparser():
assert args.file == file_name assert args.file == file_name
assert args.verbose == True assert args.verbose == True
def test_setup_logging(): def test_setup_logging():
args = insert_sites.setup_argparser(["test"]) args = insert_sites.setup_argparser(["test"])
logger = insert_sites.setup_logging(args) logger = insert_sites.setup_logging(args)
@@ -79,22 +55,3 @@ def test_setup_logging():
logger = insert_sites.setup_logging(args) logger = insert_sites.setup_logging(args)
assert logger.level == logging.INFO assert logger.level == logging.INFO
def test_add_sites(mock_sites, db_initialized):
db_session = db.get_session()
insert_sites.add_sites(db_session, mock_sites)
results = []
with db_session.begin() as session:
results = session.execute(select(db.RecipeSite)).all()
assert len(results) > 0
assert len(results) == 2
print(db.RecipeSite(name="a"))
for i, (site,) in enumerate(results):
site.name == mock_sites[i]["name"]
site.ingredient_class == mock_sites[i]["ingredient_class"]
site.name_class == mock_sites[i]["name_class"]
site.base_url == mock_sites[i]["base_url"]

View File

@@ -1,115 +1,3 @@
from recipe_graph import scrape from recipe_graph import scrape
from bs4 import BeautifulSoup
from recipe_graph.db import RecipeSite, Recipe, RecipeIngredient, RecipeIngredientParts
from pytest import fixture
@fixture
def mock_site():
return RecipeSite(
name="mock-site",
ingredient_class="mock-ing",
name_class="mock-name",
base_url="example-site/mock-site",
)
# TODO: should probably load HTML from file
@fixture
def mock_page():
return BeautifulSoup(
"""
<header></header><body>
<div class="mock-name">test_recipe</div>
<div class="mock-ing">test_ingredient</div>
</body>
""",
"html.parser",
)
@fixture
def mock_blank_page():
return BeautifulSoup(""" <header></header><body> </body> """, "html.parser")
@fixture
def mock_recipe():
return Recipe(name="test_recipe", identifier="mock_1")
@fixture
def mock_ingredient():
return RecipeIngredient(text="1 ounce water")
@fixture
def mock_url():
return "example-site/mock-site"
def test_load_page():
page = scrape.load_page("https://www.google.com")
assert type(page) == BeautifulSoup
page = scrape.load_page("https://www.google.com/some-nonsense")
assert page == None
def test_ingredient_regex():
regex = scrape.ingredient_regex(["cup"], ["crushed"])
assert (
regex.pattern
== "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up)e?s?)?)((?:(?:(?:[cC]rushed)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
)
regex = scrape.ingredient_regex(["cup", "ounce"], ["crushed", "ground"])
assert (
regex.pattern
== "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up|[oO]unce)e?s?)?)((?:(?:(?:[cC]rushed|[gG]round)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
)
def test_parse_ingredient(mock_ingredient):
parts = scrape.parse_ingredient(mock_ingredient.text)
assert len(parts) > 0
assert parts == ['1', 'ounce', '', 'water', None]
parts = scrape.parse_ingredient("Water")
assert len(parts) > 0
assert parts == [None, None, None, 'Water', None]
parts = scrape.parse_ingredient("")
assert parts == None
def test_parse_recipe_name(mock_site, mock_page, mock_recipe, mock_url, mock_blank_page,):
expected_name = mock_recipe.name
mock_recipe.name = None
mock_recipe = scrape.parse_recipe_name(
mock_site,
mock_page,
mock_recipe,
)
assert mock_recipe.name == expected_name
ex = None
try:
mock_recipe = scrape.parse_recipe_name(
mock_site,
mock_blank_page,
mock_recipe,
)
except Exception as e:
ex = e
url = {"site": mock_site.base_url, "recipe": mock_recipe.identifier}
assert str(e) == f"Could not extract recipe name: {url}"
assert ex
def test_ingredient_to_parts(mock_ingredient):
parts = scrape.ingredient_to_parts(mock_ingredient)
assert parts.quantity == "1"
assert parts.unit == "ounce"
assert parts.instruction == ""
assert parts.ingredient == "water"
assert parts.supplement == None
import pytest