Compare commits

...

15 Commits

Author SHA1 Message Date
c4d5b3a7bf commented unused code
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-19 18:08:47 -04:00
9d0413ada5 refactor
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-19 17:09:57 -04:00
a04bb06ed8 added parse_ingredient test
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-19 01:04:52 -04:00
cf05777f2c increassed code coverage
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-18 23:52:20 -04:00
209597432d added ingrediet_to_parts tests
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-18 23:31:42 -04:00
e207c359ed test parse_recipe_name
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-18 19:05:34 -04:00
35fadd6638 changed test addresses to google
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-18 18:36:11 -04:00
259c08fd4e added test for creating regex
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-18 16:02:14 -04:00
e6d150421f updated readme 2023-05-18 11:36:17 -04:00
76e0438062 moved from urllib to requests
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-18 09:09:12 -04:00
794dbe7d88 updated readme about testing 2023-05-17 22:15:53 -04:00
b6daacca2d test load_recipe
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-17 22:01:19 -04:00
719b544007 updated ci triggers
All checks were successful
continuous-integration/drone/push Build is passing
2023-05-16 17:26:59 -04:00
03ecae4be5 insert_sites tests done 2023-05-16 17:24:06 -04:00
5201a444e9 added requirements to package 2023-05-16 14:57:43 -04:00
9 changed files with 356 additions and 149 deletions

View File

@@ -6,7 +6,7 @@ environment:
trigger:
event:
include:
- pull_request
- push
steps:
- name: db-up

View File

@@ -88,9 +88,13 @@ docker-compose -p recipe-test up
running tests
```sh
pytest
pytest --cov=src/recipe_graph --cov-report lcov --cov-report html
```
The html report is under `htmlcov/` and can be viewed through any browser.
The `lcov` file can be used for the [Coverage Gutters](https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters)
plugin for VS Code to view coverage in your editor.
**WARNING**: If you get `ERROR at setup of test_db_connection` and
`ERROR at setup of test_db_class_creation`, please check whether the testing
database is already initialized. Testing is destructive and should be done on a fresh database.
@@ -102,18 +106,8 @@ docker-compose -p recipe-test down
```
Test are written in pytest framework. Currently focused on unittest.
Integration tests to come.
To run test use:
```
pytest --cov=src/recipe_graph --cov-report lcov --cov-report html
```
The html report is under `htmlcov/` and can be viewed through any browser.
The `lcov` file can be used for the [Coverage Gutters](https://marketplace.visualstudio.com/items?itemName=ryanluker.vscode-coverage-gutters)
plugin for VS Code to view coverage in your editor.
Tests are written with the pytest framework. They currently focus on unit tests
and code coverage. Integration tests are to come.
## TODO
> ☑ automate scraping\

View File

@@ -2,5 +2,14 @@
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
[metadata]
name = "recepie_graph"
[project]
name = "recipe_graph"
version = "0.0.1"
description = "mapping out recipes relations"
dependencies = [
"SQLAlchemy==1.4.39",
"python-dotenv==0.20.0",
"beautifulsoup4==4.11.1",
"psycopg2-binary==2.9.3",
"requests~=2.30.0"
]

View File

@@ -12,6 +12,7 @@ pyparsing==3.0.9
pytest==7.1.3
pytest-cov==4.0.0
python-dotenv==0.20.0
requests~=2.30.0
soupsieve==2.3.2.post1
SQLAlchemy==1.4.39
tomli==2.0.1

View File

@@ -1,5 +1,5 @@
from pydoc import apropos
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import Session
from recipe_graph import db
import json
import argparse
@@ -7,7 +7,7 @@ import logging
import sys
def load_file(f_name: str) -> list[dict[str, any]]:
    """Read a JSON file and return its decoded content.

    Args:
        f_name: Path of the JSON file to load.

    Returns:
        Whatever ``json.load`` decodes from the file. The annotation says a
        list of dictionaries; nothing here validates that shape.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding,
    # which would garble non-ASCII values on some systems.
    with open(f_name, encoding="utf-8") as f:
        return json.load(f)
@@ -29,18 +29,27 @@ def setup_logging(args: argparse.Namespace) -> logging.Logger:
return logger
def main(): # pragma: no cover
def add_sites(
    S: Session,
    sites: list[dict[str, any]],
    logger: logging.Logger = None,
):
    """Add one ``db.RecipeSite`` row per entry of ``sites``.

    Args:
        S: Object whose ``begin()`` is used as a context manager yielding the
            session the rows are added to (``main`` passes
            ``db.get_session()``).
        sites: Dictionaries that are expanded into ``db.RecipeSite`` keyword
            arguments.
        logger: Optional logger; when given, every site is logged before it
            is added.
    """
    # NOTE(review): presumably the transaction is committed when the ``with``
    # block exits without error -- confirm against db.get_session().
    with S.begin() as session:
        for site_fields in sites:
            if logger:  # pragma: no cover
                logger.info(f"Adding {site_fields}")
            record = db.RecipeSite(**site_fields)
            session.add(record)
def main(): # pragma: no cover
args = setup_argparser(sys.argv[1:])
logger = setup_logging(args)
S = db.get_session()
sites = load_file(args.file)
with S.begin() as session:
for site in sites:
logger.info(f"Adding {site}")
session.add(db.RecipeSite(**site))
add_sites(S, sites, logger)
if __name__ == "__main__": # pragma: no cover
if __name__ == "__main__": # pragma: no cover
main()

View File

@@ -4,47 +4,59 @@ import re
from sqlalchemy import select, desc, exists, not_, except_
from sqlalchemy.orm import sessionmaker
import bs4
from urllib.request import urlopen
import requests as req
from urllib.parse import urljoin
import logging
from argparse import ArgumentParser
def parse_ingredient(ingredient_text):
units = ['teaspoon', 'tablespoon', 'gram', 'ounce', 'jar', 'cup', 'pinch',
'container', 'slice', 'package', 'pound', 'can', 'dash', 'spear',
'bunch', 'quart', 'cube', 'envelope', 'square', 'sprig', 'bag',
'box', 'drop', 'fluid ounce', 'gallon', 'head', 'link', 'loaf',
'pint', 'pod', 'sheet', 'stalk', 'whole', 'bar', 'bottle', 'bulb',
'year', 'fillet', 'litter', 'packet', 'slices']
instructions = ['and', 'or', 'chopped', 'diced', 'brewed', 'chilled',
'chunky', 'small', 'medium', 'large', 'couarse', 'cracked',
'crushed', 'ground', 'cooked', 'cubed', 'crumbled', 'cut',
'cold', 'hot', 'warm', 'day', 'old', 'drained', 'canned',
'dried', 'dry', 'fine', 'firm', 'fresh', 'frozen',
'grated', 'grilled', 'hard', 'hot', 'juliened?', 'leftover',
'light', 'lite', 'mashed', 'melted', 'minced', 'packed',
'peeled', 'pitted', 'sliced', 'prepared', 'refrigerated',
'rehydrated', 'seedless', 'shaved', 'shredded', 'sifted',
'sieved', 'shucked', 'slivered', 'thick', 'sliced', 'thin',
'toasted', 'trimmed', 'unbaked', 'uncooked', 'unpeeled',
'unopened', 'unseasoned']
number_regex = '((?:[\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\s?(?:\(.+\))?)*)'
ingredient_regex = '([a-zA-Z \'\-]+)'
supplement_regex = ',?(.*)'
units_regex = "|".join([f'[{unit[0]}{unit[0].capitalize()}]{unit[1:]}'
for unit in units])
def ingredient_regex(units: list[str], instructions: list[str]) -> re.Pattern:
    """Compile the pattern used to split one ingredient line into its parts.

    The pattern has five capture groups, in this order:

    1. quantity: digits, ``.``, ``/`` and the characters U+00BC-U+00BE and
       U+2150-U+215E, optionally followed by whitespace and a parenthesised
       note;
    2. unit: one of ``units``, optionally followed by ``e`` and/or ``s``;
    3. instruction: any run of ``instructions`` words (each optionally
       followed by ``ly``) and spaces;
    4. ingredient name: letters, spaces, apostrophes and hyphens;
    5. supplement: everything after an optional comma.

    Args:
        units: Unit words. Each is matched with its first letter in either
            case.
        instructions: Instruction words, treated the same way.

    Returns:
        The compiled pattern.

    Note:
        Words are inserted into the pattern verbatim (they are not escaped),
        so an entry may itself contain regex syntax. Every entry must be
        non-empty because its first character is indexed.
    """

    def either_case_initial(words: list[str]) -> str:
        # "cup" -> "[cC]up"; alternatives are joined with "|".
        return "|".join(f"[{w[0]}{w[0].capitalize()}]{w[1:]}" for w in words)

    # Raw strings keep the backslashes for the regex engine without relying
    # on Python passing unknown escapes such as "\d" through (deprecated).
    number_part = r"((?:[\d\./\u00BC-\u00BE\u2150-\u215E]*\s?(?:\(.+\))?)*)"
    name_part = r"([a-zA-Z '\-]+)"
    supplement_part = ",?(.*)"
    unit_part = f"((?:(?:{either_case_initial(units)})e?s?)?)"
    instruction_part = (
        f"((?:(?:(?:{either_case_initial(instructions)})(?:ly)?)| )*)"
    )

    return re.compile(
        number_part + unit_part + instruction_part + name_part + supplement_part
    )
# TODO: load units and instructions from config.
# Moved data into optional parameters for the time being.
def parse_ingredient(
ingredient_text: str,
units: list[str] = [ "teaspoon", "tablespoon", "gram", "ounce", "jar",
"cup", "pinch", "container", "slice", "package",
"pound", "can", "dash", "spear", "bunch", "quart",
"cube", "envelope", "square", "sprig", "bag", "box",
"drop", "fluid ounce", "gallon", "head", "link",
"loaf", "pint", "pod", "sheet", "stalk", "whole",
"bar", "bottle", "bulb", "year", "fillet", "litter",
"packet", "slices"],
instructions: list[str] = [
"and", "or", "chopped", "diced", "brewed", "chilled", "chunky", "small",
"medium", "large", "couarse", "cracked", "crushed", "ground", "cooked",
"cubed", "crumbled", "cut", "cold", "hot", "warm", "day", "old",
"drained", "canned", "dried", "dry", "fine", "firm", "fresh", "frozen",
"grated", "grilled", "hard", "hot", "juliened?", "leftover", "light",
"lite", "mashed", "melted", "minced", "packed", "peeled", "pitted",
"sliced", "prepared", "refrigerated", "rehydrated", "seedless", "shaved",
"shredded", "sifted", "sieved", "shucked", "slivered", "thick", "sliced",
"thin", "toasted", "trimmed", "unbaked", "uncooked", "unpeeled",
"unopened", "unseasoned"],
):
regex = ingredient_regex(units, instructions)
m = regex.match(ingredient_text)
logging.info(f"Parsed {ingredient_text}, found: {m}")
if not m:
@@ -52,95 +64,120 @@ def parse_ingredient(ingredient_text):
return [text.strip() if text else None for text in m.groups()]
# this code is unused
# TODO: add tests when this is used
def missing_ingredients_query(session):
    """Return the ``db.RecipeIngredient`` rows that have no parsed parts yet.

    The ids are computed as the ``RecipeIngredient.id`` values minus the
    ``RecipeIngredientParts.id`` values (an EXCEPT of the two selects).

    Args:
        session: Session used to run the query.

    Returns:
        The result of ``.all()`` for the matching ``db.RecipeIngredient`` rows.
    """
    unparsed_ids = except_(
        select(db.RecipeIngredient.id),
        select(db.RecipeIngredientParts.id),
    ).alias("missing")
    query = session.query(db.RecipeIngredient)
    return query.where(db.RecipeIngredient.id.in_(unparsed_ids)).all()
def reparse_ingredients(session):
cte = (except_(select(db.RecipeIngredient.id),
select(db.RecipeIngredientParts.id))).\
alias('missing')
missing = session.query(db.RecipeIngredient).where(db.RecipeIngredient.id.in_(cte)).all()
# this code is unused
# TODO: add tests when this is used
def parse_missing_ingredients(session):
    """Parse every ingredient that has no parts row and add the result.

    Ingredients are taken from ``missing_ingredients_query``; each one is
    converted with ``ingredient_to_parts`` and the resulting object is added
    to ``session``. Ingredients that cannot be parsed are skipped.

    Args:
        session: Session used for the query and for adding the new rows.
    """
    for ingredient in missing_ingredients_query(session):
        parts = ingredient_to_parts(ingredient)
        # ingredient_to_parts returns None when the text cannot be parsed;
        # None is not a mapped instance and must not be handed to the session.
        if parts:
            session.add(parts)
def load_recipe(recipe_url):
try:
logging.info(f'Loading Recipe: {recipe_url}')
with urlopen(recipe_url) as f:
if f.getcode() == 404:
raise Exception(f"Recipe Does not exist: {recipe_url}")
return bs4.BeautifulSoup(f.read().decode(), 'html.parser')
def load_page(recipe_url: str, timeout: float = 30.0) -> bs4.BeautifulSoup:
    """Download ``recipe_url`` and parse the response body as HTML.

    Args:
        recipe_url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the scraper indefinitely.

    Returns:
        The parsed page, or ``None`` when the request fails, times out, the
        server answers 404, or parsing raises. Failures are logged as
        warnings rather than propagated.
    """
    try:
        logging.info(f"Loading Page: {recipe_url}")
        with req.get(recipe_url, timeout=timeout) as resp:
            # Only 404 is treated as "missing"; other statuses are parsed as-is.
            if resp.status_code == 404:
                raise Exception(f"Page does not exist (404): {recipe_url}")
            return bs4.BeautifulSoup(resp.text, "html.parser")
    except Exception as e:
        logging.warning(f"Could not download or parse recipe: {recipe_url}")
        logging.warning(e)
        return None
def parse_recipe(session, recipe, site):
recipe_url = urljoin(site.base_url, str(recipe.identifier))
recipe_page = load_recipe(recipe_url)
if not recipe_page:
return None
name_candidates = recipe_page.find_all(class_=site.name_class)
def parse_recipe_name(
    site: db.RecipeSite,
    page: bs4.BeautifulSoup,
    recipe: db.Recipe,
    url: str = None,
) -> db.Recipe:
    """Set ``recipe.name`` from the first element of ``page`` with the site's name class.

    Args:
        site: Supplies ``name_class`` (the CSS class to search for) and
            ``base_url``.
        page: Parsed page to search.
        recipe: Recipe whose ``name`` attribute is overwritten.
        url: Text used in log and error messages. When falsy, a dictionary
            holding the site's base URL and the recipe identifier is used
            instead.

    Returns:
        The same ``recipe`` object, with ``name`` set.

    Raises:
        Exception: If no element with the site's name class is found.
    """
    source = url or {"site": site.base_url, "recipe": recipe.identifier}

    matches = page.find_all(class_=site.name_class)
    if not matches:
        raise Exception(f"Could not extract recipe name: {source}")

    recipe.name = matches[0].text
    logging.info(f"Adding Recipe {recipe.name} from {source}")
    return recipe
def ingredient_to_parts(
    ingredient: db.Ingredient
) -> db.RecipeIngredientParts:
    """Build the parts row for ``ingredient`` from its parsed text.

    Args:
        ingredient: Object providing ``text`` (the raw ingredient line) and
            ``id`` (reused as the id of the parts row).

    Returns:
        A ``db.RecipeIngredientParts`` built from the five values returned by
        ``parse_ingredient``, or ``None`` when ``parse_ingredient`` returns
        nothing for the text.
    """
    parsed = parse_ingredient(ingredient.text)
    if not parsed:
        return None

    quantity, unit, instruction, name, supplement = parsed
    return db.RecipeIngredientParts(
        id=ingredient.id,
        quantity=quantity,
        unit=unit,
        instruction=instruction,
        ingredient=name,
        supplement=supplement,
    )
def parse_recipe(session, recipe, site):
    """Download one recipe page and store the recipe with its ingredients.

    Args:
        session: Session the recipe, ingredient and parts rows are added to.
        recipe: Recipe object; ``identifier`` is joined to the site's base
            URL to form the page address.
        site: Supplies ``base_url``, the name class and ``ingredient_class``.

    Returns:
        The recipe (with its name set), or ``None`` when the page could not
        be loaded.

    Raises:
        Exception: Propagated from ``parse_recipe_name`` when the page has no
            recipe name.
    """
    recipe_url = urljoin(site.base_url, str(recipe.identifier))
    page = load_page(recipe_url)
    if not page:
        return None

    recipe = parse_recipe_name(site, page, recipe, recipe_url)
    session.add(recipe)
    # presumably flushed so recipe.id is available for the rows below -- confirm
    session.flush()

    ingredient_nodes = page.find_all(class_=site.ingredient_class)
    for node in ingredient_nodes:
        ingredient = db.RecipeIngredient(text=node.text, recipe_id=recipe.id)
        session.add(ingredient)
        session.flush()

        parts = ingredient_to_parts(ingredient)
        if parts:
            session.add(parts)

    logging.info(f"{len(ingredient_nodes)} ingredients found. Inserting into DB")
    return recipe
def main():
def main(): # pragma: no cover
parser = ArgumentParser(description="Scrape a recipe site for recipies")
parser.add_argument('site',
help='Name of site')
parser.add_argument('-id', '--identifier', dest='id',
help='url of recipe(reletive to base url of site) or commma seperated list')
parser.add_argument('-a', '--auto', action='store', dest='n',
help='automaticaly generate identifier(must supply number of recipies to scrape)')
parser.add_argument('-v', '--verbose', action='store_true')
parser.add_argument("site", help="Name of site")
parser.add_argument(
"-id",
"--identifier",
dest="id",
help="url of recipe(reletive to base url of site) or commma seperated list",
)
parser.add_argument(
"-a",
"--auto",
action="store",
dest="n",
help="automaticaly generate identifier(must supply number of recipies to scrape)",
)
parser.add_argument("-v", "--verbose", action="store_true")
args = parser.parse_args(sys.argv)
if args.verbose:
logging.basicConfig(level=logging.INFO)
logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
logging.getLogger("sqlalchemy.engine").setLevel(logging.INFO)
eng = db.get_engine()
S = sessionmaker(eng)
@@ -148,32 +185,34 @@ def main():
with S.begin() as sess:
site = sess.query(db.RecipeSite).where(db.RecipeSite.name == args.site).one()
site_id = site.id
recipe_ids = []
starting_id = 0
if args.id and not args.n:
recipe_ids.append(args.id)
logging.info(f'Retreiving single recipe: {args.id}')
logging.info(f"Retreiving single recipe: {args.id}")
elif args.n:
if not args.id:
last_recipe = sess.query(db.Recipe).\
where(db.Recipe.recipe_site_id == site.id).\
order_by(desc(db.Recipe.identifier)).\
limit(1).\
scalar()
last_recipe = (
sess.query(db.Recipe)
.where(db.Recipe.recipe_site_id == site.id)
.order_by(desc(db.Recipe.identifier))
.limit(1)
.scalar()
)
starting_id = int(last_recipe.identifier) + 1
else:
starting_id = int(args.id)
recipe_ids = range(starting_id, starting_id+int(args.n))
logging.info(f'Retreving {args.n} recipes from {site.base_url} starting at {starting_id}')
for recipe_id in recipe_ids:
try:
recipe_ids = range(starting_id, starting_id + int(args.n))
logging.info(
f"Retreving {args.n} recipes from {site.base_url} starting at {starting_id}"
)
for recipe_id in recipe_ids:
try:
savepoint = sess.begin_nested()
recipe = db.Recipe(identifier = recipe_id, recipe_site_id = site.id)
recipe = db.Recipe(identifier=recipe_id, recipe_site_id=site.id)
parse_recipe(sess, recipe, site)
savepoint.commit()
@@ -185,6 +224,6 @@ def main():
logging.error(e)
continue
if __name__ == "__main__": # pragma: no cover
main()
if __name__ == "__main__": # pragma: no cover
main()

View File

@@ -8,7 +8,7 @@ import sqlalchemy
import pytest
@pytest.fixture
@pytest.fixture()
def engine() -> sqlalchemy.engine.Engine:
engine = db.get_engine()
# make sure db is empty otherwise might be testing a live db

View File

@@ -1,30 +1,54 @@
import json
import os
from recipe_graph import insert_sites
from recipe_graph import insert_sites, db
from sqlalchemy import select
import sqlalchemy
import logging
import pytest
from test_db import engine, init_db
@pytest.fixture
def json_data() -> list[dict]:
def json_data() -> list[dict[str, any]]:
return [{"key": "value"}, {"test": "value1", "test2": "value2"}]
@pytest.fixture
def db_initialized(engine) -> sqlalchemy.engine.Engine:
init_db(engine)
return engine
@pytest.fixture
def mock_sites() -> list[dict[str, any]]:
return [
{
"name": "example-site",
"ingredient_class": "example-item-name",
"name_class": "example-content",
"base_url": "https://www.example.com/recipe/",
},
{
"name": "test-site",
"ingredient_class": "test-item-name",
"name_class": "test-content",
"base_url": "https://www.test.com/recipe/",
},
]
@pytest.fixture
def json_file(json_data: list[dict]) -> str:
f_path = "test.json"
with open(f_path, 'w') as f:
with open(f_path, "w") as f:
json.dump(json_data, f)
yield f_path
if os.path.exists(f_path):
os.remove(f_path)
def test_load_file(json_file: str, json_data):
test_data = insert_sites.load_file(json_file)
assert test_data == json_data
def test_setup_argparser():
file_name = "test"
@@ -33,7 +57,6 @@ def test_setup_argparser():
assert args.file == file_name
assert args.verbose == False
args = insert_sites.setup_argparser([file_name, "-v"])
assert args.file == file_name
assert args.verbose == True
@@ -42,16 +65,36 @@ def test_setup_argparser():
assert args.file == file_name
assert args.verbose == True
def test_setup_logging():
args = insert_sites.setup_argparser(["test"])
logger = insert_sites.setup_logging(args)
assert logger.level == logging.WARNING
args = insert_sites.setup_argparser(["test", "-v"])
logger = insert_sites.setup_logging(args)
logger = insert_sites.setup_logging(args)
assert logger.level == logging.INFO
args = insert_sites.setup_argparser(["test", "--verbose"])
logger = insert_sites.setup_logging(args)
logger = insert_sites.setup_logging(args)
assert logger.level == logging.INFO
def test_add_sites(mock_sites, db_initialized):
    """add_sites stores one RecipeSite row per input dictionary, fields intact."""
    db_session = db.get_session()
    insert_sites.add_sites(db_session, mock_sites)

    with db_session.begin() as session:
        results = session.execute(select(db.RecipeSite)).all()
        assert len(results) == len(mock_sites)

        # The SELECT has no ORDER BY, so rows are matched by name rather than
        # by position.
        stored = {site.name: site for (site,) in results}
        assert set(stored) == {expected["name"] for expected in mock_sites}

        for expected in mock_sites:
            site = stored[expected["name"]]
            assert site.ingredient_class == expected["ingredient_class"]
            assert site.name_class == expected["name_class"]
            assert site.base_url == expected["base_url"]

View File

@@ -1,3 +1,115 @@
from recipe_graph import scrape
from bs4 import BeautifulSoup
from recipe_graph.db import RecipeSite, Recipe, RecipeIngredient, RecipeIngredientParts
from pytest import fixture
@fixture
def mock_site():
return RecipeSite(
name="mock-site",
ingredient_class="mock-ing",
name_class="mock-name",
base_url="example-site/mock-site",
)
# TODO: should probably load HTML from file
@fixture
def mock_page():
return BeautifulSoup(
"""
<header></header><body>
<div class="mock-name">test_recipe</div>
<div class="mock-ing">test_ingredient</div>
</body>
""",
"html.parser",
)
@fixture
def mock_blank_page():
return BeautifulSoup(""" <header></header><body> </body> """, "html.parser")
@fixture
def mock_recipe():
return Recipe(name="test_recipe", identifier="mock_1")
@fixture
def mock_ingredient():
return RecipeIngredient(text="1 ounce water")
@fixture
def mock_url():
return "example-site/mock-site"
def test_load_page():
    """load_page returns a parsed page for a live URL and None for a 404."""
    # NOTE: both requests go to the public internet.
    page = scrape.load_page("https://www.google.com")
    assert isinstance(page, BeautifulSoup)

    page = scrape.load_page("https://www.google.com/some-nonsense")
    assert page is None
def test_ingredient_regex():
regex = scrape.ingredient_regex(["cup"], ["crushed"])
assert (
regex.pattern
== "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up)e?s?)?)((?:(?:(?:[cC]rushed)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
)
regex = scrape.ingredient_regex(["cup", "ounce"], ["crushed", "ground"])
assert (
regex.pattern
== "((?:[\\d\\./\\u00BC-\\u00BE\\u2150-\\u215E]*\\s?(?:\\(.+\\))?)*)((?:(?:[cC]up|[oO]unce)e?s?)?)((?:(?:(?:[cC]rushed|[gG]round)(?:ly)?)| )*)([a-zA-Z '\\-]+),?(.*)"
)
def test_parse_ingredient(mock_ingredient):
    """parse_ingredient splits text into five parts and rejects empty text."""
    parts = scrape.parse_ingredient(mock_ingredient.text)
    assert parts == ["1", "ounce", "", "water", None]

    parts = scrape.parse_ingredient("Water")
    assert parts == [None, None, None, "Water", None]

    # Empty text cannot match the ingredient pattern.
    assert scrape.parse_ingredient("") is None
def test_parse_recipe_name(mock_site, mock_page, mock_recipe, mock_url, mock_blank_page,):
expected_name = mock_recipe.name
mock_recipe.name = None
mock_recipe = scrape.parse_recipe_name(
mock_site,
mock_page,
mock_recipe,
)
assert mock_recipe.name == expected_name
ex = None
try:
mock_recipe = scrape.parse_recipe_name(
mock_site,
mock_blank_page,
mock_recipe,
)
except Exception as e:
ex = e
url = {"site": mock_site.base_url, "recipe": mock_recipe.identifier}
assert str(e) == f"Could not extract recipe name: {url}"
assert ex
def test_ingredient_to_parts(mock_ingredient):
    """ingredient_to_parts copies each parsed field onto the parts object."""
    parts = scrape.ingredient_to_parts(mock_ingredient)
    assert parts.quantity == "1"
    assert parts.unit == "ounce"
    assert parts.instruction == ""
    assert parts.ingredient == "water"
    assert parts.supplement is None
import pytest