Update parser, Implement logging

commit 4bda3fa5fb
parent 0be7701451
Author: michigg
Date:   2018-03-22 17:22:42 +01:00

9 changed files with 177 additions and 154 deletions

View File

@@ -3,9 +3,11 @@ ADD ["ofu_app/requirements.txt", "/requirements.txt"]
 RUN apk upgrade --update && \
     apk add --update python3 py3-pillow py3-lxml py3-psycopg2 && \
     pip3 install -r /requirements.txt && rm /requirements.txt
+EXPOSE 80
 WORKDIR /app
 VOLUME ["/app"]
 VOLUME ["/app/data"]
 VOLUME ["/app/media"]
+VOLUME ["/app/log"]
 ENTRYPOINT ["python3", "manage.py"]
 CMD ["runserver", "0.0.0.0:80"]

View File

@@ -9,6 +9,7 @@ services:
     volumes:
       - ./data/data:/data
       - ./data/media:/media
+      - ./log:/log
       - ./ofu_app/:/app
     env_file:
      - docker.env

View File

@@ -1,5 +1,5 @@
-from django.core.management.base import BaseCommand, CommandError
+from django.core.management.base import BaseCommand
 from apps.food.models import Menu, HappyHour, SingleFood
 from apps.food.utils import migrate_data

View File

@@ -4,6 +4,9 @@ from pprint import pprint
 from django.db.utils import IntegrityError
 from apps.food.models import SingleFood, Menu, HappyHour, Allergene
 from apps.food.utils.parser import mensa_page_parser, fekide_happyhour_page_parser, cafete_page_parser
+import logging
+
+logger = logging.getLogger(__name__)
 
 # CONFIG SERVICE LINKS
 LINK_FEKI_MENSA = "https://www.studentenwerk-wuerzburg.de/bamberg/essen-trinken/speiseplaene.html?tx_thmensamenu_pi2%5Bmensen%5D=3&tx_thmensamenu_pi2%5Baction%5D=show&tx_thmensamenu_pi2%5Bcontroller%5D=Speiseplan&cHash=c3fe5ebb35e5fba3794f01878e798b7c"
@@ -15,79 +18,58 @@ LINK_FEKIDE_GUIDE = "https://www.feki.de/happyhour"
 LOCATION_NAMES = ('erba', 'markusplatz', 'feldkirchenstraße', 'austraße')
 
-def getJsonFromFile(path):
-    with open(path, "r") as file:
-        return json.load(file)
-
 
 def getLocation(raw_loc):
     for choice, name in zip(Menu.LOCATION_CHOICES, LOCATION_NAMES):
-        print(name.upper() in str(raw_loc).upper())
-        if (name.upper() in str(raw_loc).upper()):
+        if name.upper() in str(raw_loc).upper():
             return choice
-    print("LOCATION NOT FOUND")
+    logger.warning("{loc} unknown location".format(loc=raw_loc))
+    return None
 
 
 def writeStudentenwerkDataInDB(data):
-    data = json.loads(data)
-    pprint(data)
+    if not data:
+        logger.warning('no data')
+        return
+    logger.info("{location}".format(location=data['name']))
     for menu in data['weekmenu']:
-        pprint(menu)
+        logger.info("{date}".format(date=menu['date']))
         foodlist = []
         for single_food in menu['menu']:
-            pprint(single_food)
+            logger.info("{}".format(single_food['title']))
+            allergens = []
             if 'allergens' in single_food:
-                allergens = []
                 for allergen in single_food['allergens']:
-                    try:
-                        allergens.append(Allergene.objects.create(name=allergen))
-                    except IntegrityError:
-                        allergens.append(Allergene.objects.get(name=allergen))
+                    allergens.append(Allergene.objects.get_or_create(name=allergen)[0])
+            # TODO: Consider keyword arg for price
             try:
-                if 'prices' in single_food:
-                    if 'price_student' in single_food['prices']:
-                        price_student = single_food['prices']['price_student']
-                    else:
-                        price_student = "None"
-                    if 'price_employee' in single_food['prices']:
-                        price_employee = single_food['prices']['price_employee']
-                    else:
-                        price_employee = "None"
-                    if 'price_guest' in single_food['prices']:
-                        price_guest = single_food['prices']['price_guest']
-                    else:
-                        price_guest = "None"
-                    db_single_food = SingleFood.objects.create(name=single_food['title'],
-                                                               price_student=price_student,
-                                                               price_employee=price_employee,
-                                                               price_guest=price_guest)
-                else:
-                    db_single_food = SingleFood.objects.create(name=single_food['title'])
-                if 'allergens' in locals():
-                    db_single_food.allergens.set(allergens)
-                foodlist.append(db_single_food)
-            except IntegrityError:
-                db_single_food = SingleFood.objects.get(name=single_food['title'])
+                db_single_food, created = SingleFood.objects.get_or_create(name=single_food['title'])
                 if 'prices' in single_food:
                     if 'price_student' in single_food['prices']:
                         db_single_food.price_student = single_food['prices']['price_student']
-                    else:
-                        db_single_food.price_student = "None"
                     if 'price_employee' in single_food['prices']:
                         db_single_food.price_employee = single_food['prices']['price_employee']
-                    else:
-                        db_single_food.price_employee = "None"
                     if 'price_guest' in single_food['prices']:
                         db_single_food.price_guest = single_food['prices']['price_guest']
-                if 'allergens' in locals():
+                    else:
+                        db_single_food.price_guest = "None"
+                if allergens:
                     db_single_food.allergens.set(allergens)
                 foodlist.append(db_single_food)
+                db_single_food.save()
+            except IntegrityError as e:
+                logger.exception(e)
         try:
             date = datetime.strptime(str(menu['date']), "%d.%m.").replace(year=datetime.today().year)
-            menu = Menu.objects.create(location=getLocation(data['name']), date=date)
+            menu, _ = Menu.objects.get_or_create(location=getLocation(data['name']), date=date)
             menu.menu.set(foodlist)
             menu.save()
         except IntegrityError as error:
-            # ignored
-            pass
+            logger.exception(error)
 
 
 def writeFekideDataInDB(data):
@@ -106,14 +88,16 @@ def writeFekideDataInDB(data):
         happyhour.endtime = datetime.strptime(time[1], "%H:%M").time()
         happyhour.save()
-        print("%s: Happy Hour: Location: %s, Description: %s" % (
-            str(happyhour.date.date()), str(happyhour.location), str(happyhour.description)))
+        logger.info("%s: Happy Hour: Location: %s, Description: %s",
+                    str(happyhour.date.date()), str(happyhour.location), str(happyhour.description))
 
 
 def writeoutDBObjects():
-    pprint("SingleFood: " + str(SingleFood.objects.count()))
-    pprint("Menu: " + str(Menu.objects.count()))
-    pprint("HappyHour: " + str(HappyHour.objects.count()))
+    return "\n\tSingleFood: {single_food}\n\tMenu: {menu}\n\tHappyHour: {happy_hour}".format(
+        single_food=SingleFood.objects.count(),
+        menu=Menu.objects.count(),
+        happy_hour=HappyHour.objects.count()
+    )
 
 
 def delete():
@@ -126,17 +110,16 @@ def delete():
 def main():
-    print("Aktueller Stand:")
-    writeoutDBObjects()
+    logger.info("Aktueller Stand:" + writeoutDBObjects())
     # get food jsons
     writeStudentenwerkDataInDB(mensa_page_parser.parsePage(LINK_AUSTR_MENSA))
     writeStudentenwerkDataInDB(mensa_page_parser.parsePage(LINK_FEKI_MENSA))
-    writeStudentenwerkDataInDB(cafete_page_parser.parsePage(LINK_ERBA_CAFETE))
-    writeStudentenwerkDataInDB(cafete_page_parser.parsePage(LINK_MARKUS_CAFETE))
-    writeFekideDataInDB(fekide_happyhour_page_parser.parsePage(LINK_FEKIDE_GUIDE))
-    print("Neuer Stand:")
-    writeoutDBObjects()
+    writeStudentenwerkDataInDB(cafete_page_parser.parse_page(LINK_ERBA_CAFETE))
+    writeStudentenwerkDataInDB(cafete_page_parser.parse_page(LINK_MARKUS_CAFETE))
+    writeFekideDataInDB(fekide_happyhour_page_parser.parse_page(LINK_FEKIDE_GUIDE))
+    logger.info("Neuer Stand:" + writeoutDBObjects())
 
 
 if __name__ == '__main__':
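
The switch from objects.create() wrapped in try/except IntegrityError to objects.get_or_create() is what makes the import idempotent: re-running it updates existing rows instead of failing on duplicates. A minimal sketch of the semantics, assuming Allergene.name is unique (which the old IntegrityError handling implies); the value "Gluten" is hypothetical:

from apps.food.models import Allergene

# get_or_create returns (instance, created): the first call inserts the row,
# the second call fetches the same row instead of raising IntegrityError
# the way objects.create() would on a duplicate unique value.
first, created = Allergene.objects.get_or_create(name="Gluten")   # created is True
second, created = Allergene.objects.get_or_create(name="Gluten")  # created is False
assert first.pk == second.pk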

View File

@@ -0,0 +1,8 @@
+import requests
+
+
+def load_page(url: str):
+    response = requests.get(url)
+    if not response.ok:
+        raise ConnectionError("Response not ok", response.status_code, url)
+    return response.content
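
This new shared helper replaces the per-module loadPage() copies. Since requests' response.ok is False for any status of 400 or above, load_page raises ConnectionError on failed fetches, which each parser's broad except block turns into a logged None. A usage sketch, assuming the helper lives in the parser package's __init__ as the relative imports suggest:

from apps.food.utils.parser import load_page

try:
    html = load_page("https://www.feki.de/happyhour")  # URL from this commit
except ConnectionError as error:
    # error.args carries the message, the status code and the URL
    html = None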

View File

@@ -1,23 +1,22 @@
-import requests
-from bs4 import BeautifulSoup
-import json
 import datetime
+import logging
 import re
-from pprint import pprint
+
+from bs4 import BeautifulSoup
+
+from . import load_page
+
+logger = logging.getLogger(__name__)
 
 SPEISEPLAN_NAME_SELECTOR = '.csc-default .csc-header .csc-firstHeader'
 
 
-def loadPage(url: str):
-    return requests.get(url).content
-
-
-def getFoodplanName(soup):
+def get_foodplan_name(soup):
     foodplan_name = soup.select(SPEISEPLAN_NAME_SELECTOR)[0].getText()
     return foodplan_name
 
 
-def getRightLine(lines):
+def get_right_line(lines):
     foodlines = []
     pattern = re.compile("[0-9]+.+[A-Z]+")
     for line in list(lines):
@@ -27,42 +26,42 @@ def getRightLine(lines):
     return foodlines
 
 
-def getFoodPerDay(soup):
+def get_food_per_day(soup):
     days = []
     lines = soup.select('.csc-default .bodytext')
-    foodlines = getRightLine(lines)
+    foodlines = get_right_line(lines)
     for food in foodlines:
-        dayObj = {}
         day = str(food).split()[0]
-        foodName = str(food).replace(day, "").strip()
-        singleFoodObj = {}
-        singleFoodObj['title'] = foodName
-        dayObj['date'] = day
-        dayObj['menu'] = [singleFoodObj]
-        days.append(dayObj)
+        food_name = str(food).replace(day, "").strip()
+        single_food_obj = {'title': food_name}
+        day_obj = {
+            'date': day,
+            'menu': [single_food_obj]
+        }
+        days.append(day_obj)
     return days
 
 
-def parsePage(url: str):
-    pagecontent = {}
+def parse_page(url: str):
     # {mensaspeiseplan:
     #   {name:"",
     #    weekmenu: [day:{date:, menu:[,,,]}]
     #   }
     # }
-    page = loadPage(url)
-    mensaSpeiseplan = {}
-    soup = BeautifulSoup(page, "lxml")
-    foodplan_name = getFoodplanName(soup)
-
-    days = getFoodPerDay(soup)
-    mensaSpeiseplan['weekmenu'] = days
-    mensaSpeiseplan['name'] = foodplan_name
-    mensaSpeiseplan['execution_time'] = datetime.datetime.today().strftime("%A, %d.%m.%Y")
-    mensaSpeiseplanJson = json.dumps(mensaSpeiseplan)
-    return mensaSpeiseplanJson
+    try:
+        page = load_page(url)
+        soup = BeautifulSoup(page, "lxml")
+        foodplan_name = get_foodplan_name(soup)
+        days = get_food_per_day(soup)
+        return {
+            'weekmenu': days,
+            'name': foodplan_name,
+            'execution_time': datetime.datetime.today().strftime("%A, %d.%m.%Y")
+        }
+    except Exception as e:
+        logger.exception(e)
+        return None
 
 # LINK_ERBA_CAFETE = "https://www.studentenwerk-wuerzburg.de/bamberg/essen-trinken/sonderspeiseplaene/cafeteria-erba-insel.html"
-# pprint(parsePage(LINK_ERBA_CAFETE))
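
parse_page() now hands back a plain dict instead of the old json.dumps() string, or None when the fetch or parse fails; that None is what the `if not data` guard in writeStudentenwerkDataInDB catches. A sketch of the returned shape, with hypothetical values; the 'date' token must satisfy the "%d.%m." strptime call in the import script, and execution_time comes from strftime("%A, %d.%m.%Y"):

# Hypothetical successful result of cafete_page_parser.parse_page(url):
result = {
    'name': 'Cafeteria ERBA-Insel',                            # hypothetical plan name
    'weekmenu': [
        {'date': '22.03.', 'menu': [{'title': 'Gemüsesuppe'}]},  # hypothetical entry
    ],
    'execution_time': 'Thursday, 22.03.2018'
}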

View File

@@ -1,51 +1,53 @@
-import requests
-from bs4 import BeautifulSoup
 import datetime
-import json
+import logging
+
+from bs4 import BeautifulSoup
+
+from . import load_page
+
+logger = logging.getLogger(__name__)
 
 SPEISEPLAN_NAME_SELECTOR = '.csc-default .csc-header .csc-firstHeader'
 
 
-def loadPage(url: str):
-    return requests.get(url).content
-
-
-def getDay():
+def get_day():
     return datetime.datetime.today().strftime("%A, %d.%m.%Y")
 
 
-def getHappyHours(soup):
+def get_happy_hours(soup):
     happyhours = []
     happyhourstable = soup.select('#food .table tr')
     for tableline in happyhourstable:
-        happyhour = {}
         linesoup = BeautifulSoup(str(tableline), "lxml")
         location = linesoup.find("td", {"class": "location"}).getText()
         time = linesoup.find("td", {"class": "time"}).getText()
         description = linesoup.find("td", {"class": "description"}).getText()
         description = str(description).strip()
-
-        happyhour['location'] = location
-        happyhour['time'] = time
-        happyhour['description'] = description
+        happyhour = {
+            'location': location,
+            'time': time,
+            'description': description
+        }
         happyhours.append(happyhour)
     return happyhours
 
 
-def parsePage(url: str):
-    pagecontent = {}
+def parse_page(url: str):
     # {
     #   happyhours:[{happyhour:{location: "",time: "",description: ""},,,,]
     # }
     happyhours = []
-    page = loadPage(url)
-    soup = BeautifulSoup(page, "lxml")
-    happyhours = getHappyHours(soup)
-    pagecontent['happyhours'] = happyhours
-    pagecontent['day'] = getDay()
-    pagecontent['execution_time'] = datetime.datetime.today().strftime("%A, %d.%m.%Y")
-    return pagecontent
+    try:
+        page = load_page(url)
+        soup = BeautifulSoup(page, "lxml")
+        happyhours = get_happy_hours(soup)
+        return {
+            'happyhours': happyhours,
+            'day': get_day(),
+            'execution_time': datetime.datetime.today().strftime("%A, %d.%m.%Y")
+        }
+    except Exception as e:
+        logger.exception(e)
+        return None
 
 # LINK_FEKIDE_GUIDE = "https://www.feki.de/happyhour/wochenuebersicht"
 # parsePage(LINK_FEKIDE_GUIDE)
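
The feki.de parser follows the same contract: a dict on success, None on failure. Only the tail of writeFekideDataInDB is visible in this commit, but the happyhour.endtime line above indexes time[1] and parses it with strptime(..., "%H:%M"), so the 'time' cell is presumably split into a start/end pair elsewhere in that function. A sketch of one row with hypothetical values:

# Hypothetical successful result of fekide_happyhour_page_parser.parse_page(url):
result = {
    'happyhours': [
        {
            'location': 'Stilbruch',             # hypothetical venue
            'time': '18:00 - 20:00',             # halves parsed with "%H:%M"
            'description': 'Cocktails 4,50 EUR'  # hypothetical offer
        },
    ],
    'day': 'Thursday, 22.03.2018',
    'execution_time': 'Thursday, 22.03.2018'
}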

View File

@@ -1,14 +1,11 @@
-import requests
-from bs4 import BeautifulSoup
-import json
 import datetime
+import logging
 
-# FEKI_URL = "https://www.studentenwerk-wuerzburg.de/bamberg/essen-trinken/speiseplaene.html?tx_thmensamenu_pi2%5Bmensen%5D=3&tx_thmensamenu_pi2%5Baction%5D=show&tx_thmensamenu_pi2%5Bcontroller%5D=Speiseplan&cHash=c3fe5ebb35e5fba3794f01878e798b7c"
+from bs4 import BeautifulSoup
 
-def loadPage(url: str):
-    return requests.get(url).content
+from . import load_page
+
+logger = logging.getLogger(__name__)
 
 
 def getMenuDay(soup):
@@ -18,12 +15,10 @@ def getMenuDay(soup):
 def getFoodPerDay(soup):
     week_menus = []
     for day in soup.select('.currentweek .day'):
-        menu = {}
         daysoup = BeautifulSoup(str(day), "lxml")
         day = getMenuDay(daysoup)
         day_menu = []
         for singleFood in daysoup.select('.menuwrap .menu'):
-            singleFoodObj = {}
             singleFoodSoup = BeautifulSoup(str(singleFood), "lxml")
             title = singleFoodSoup.find('div', {'class': 'title'}).getText()
             allergens = [e.getText() for e in singleFoodSoup.select('.left .additnr .toggler ul li')]
@@ -34,13 +29,16 @@ def getFoodPerDay(soup):
                 prices['price_employee'] = singleFoodSoup.select('.price')[0]['data-bed']
             if singleFoodSoup.select('.price'):
                 prices['price_guest'] = singleFoodSoup.select('.price')[0]['data-guest']
-            singleFoodObj['title'] = title
-            singleFoodObj['allergens'] = allergens
-            singleFoodObj['prices'] = prices
-            day_menu.append(singleFoodObj)
-
-        menu['date'] = str(day).split(" ")[1]
-        menu['menu'] = day_menu
+            single_food_obj = {
+                'title': title,
+                'allergens': allergens,
+                'prices': prices
+            }
+            day_menu.append(single_food_obj)
+        menu = {
+            'date': str(day).split(" ")[1],
+            'menu': day_menu
+        }
         week_menus.append(menu)
     return week_menus
@@ -52,16 +50,19 @@ def parsePage(url: str):
     #    weekmenu: [day:{date:, menu:[,,,]}]
     #   }
     # }
-    mensaSpeiseplan = {}
-    page = loadPage(url)
-    soup = BeautifulSoup(page, "lxml")
-    foodplan_name = getFoodplanName(soup)
-    days = getFoodPerDay(soup)
-    mensaSpeiseplan['weekmenu'] = days
-    mensaSpeiseplan['name'] = foodplan_name
-    mensaSpeiseplan['execution_time'] = datetime.datetime.today().strftime("%A, %d.%m.%Y")
-    mensaSpeiseplanJson = json.dumps(mensaSpeiseplan)
-    return mensaSpeiseplanJson
+    try:
+        page = load_page(url)
+        soup = BeautifulSoup(page, "lxml")
+        foodplan_name = getFoodplanName(soup)
+        days = getFoodPerDay(soup)
+        return {
+            'weekmenu': days,
+            'name': foodplan_name,
+            'execution_time': datetime.datetime.today().strftime("%A, %d.%m.%Y")
+        }
+    except Exception as e:
+        logger.exception(e)
+        return None
 
 
 def getFoodplanName(soup):

View File

@@ -50,16 +50,15 @@ REST_FRAMEWORK = {
     'DEFAULT_PERMISSION_CLASSES': [
         'rest_framework.permissions.IsAuthenticated',
     ],
-    'PAGE_SIZE': 10
 }
 
 MIDDLEWARE = [
     'django.middleware.security.SecurityMiddleware',
     'django.contrib.sessions.middleware.SessionMiddleware',
     'django.middleware.common.CommonMiddleware',
     'django.middleware.csrf.CsrfViewMiddleware',
     'django.contrib.auth.middleware.AuthenticationMiddleware',
     'django.contrib.messages.middleware.MessageMiddleware',
     'django.middleware.clickjacking.XFrameOptionsMiddleware',
 ]
 
 ROOT_URLCONF = 'core.urls'
@@ -76,7 +75,7 @@ TEMPLATES = [
     },
     {
         'BACKEND': 'django.template.backends.django.DjangoTemplates',
-        'DIRS': [os.path.join(BASE_DIR,'templates')],
+        'DIRS': [os.path.join(BASE_DIR, 'templates')],
         'APP_DIRS': True,
         'OPTIONS': {
             'context_processors': [
@@ -190,3 +189,31 @@ EMAIL_USE_TLS = True
 ACCOUNT_EMAIL_UNIQUE = True
 ACCOUNT_EMAIL_CONFIRMATION_REQUIRED = True
+
+LOGGING = {
+    'version': 1,
+    'disable_existing_loggers': False,
+    'formatters': {
+        'default': {
+            'format': '%(asctime)s %(module)s [%(levelname)s]: %(message)s',
+            'datefmt': '%Y-%m-%d %H:%M:%S',
+        }
+    },
+    'handlers': {
+        'console': {
+            'class': 'logging.StreamHandler',
+            'formatter': 'default',
+        },
+        'file': {
+            'class': 'logging.FileHandler',
+            'filename': '/log/import_food.log',
+            'formatter': 'default',
+        }
+    },
+    'loggers': {
+        'apps.food.utils': {
+            'handlers': ['console', 'file'],
+            'level': os.getenv('DJANGO_LOG_LEVEL', 'DEBUG'),
+        },
+    },
+}
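
Because logger names form a dot-separated hierarchy, every module in this commit that calls logging.getLogger(__name__) under apps/food/utils, migrate_data and the parser package alike, resolves to a child of the configured 'apps.food.utils' logger and therefore writes to both the console and /log/import_food.log (the file the new Dockerfile volume and the compose ./log:/log mount expose on the host). A minimal sketch of that resolution:

import logging

# __name__ inside apps/food/utils/migrate_data.py evaluates to this:
logger = logging.getLogger("apps.food.utils.migrate_data")

# No handlers are attached to the child logger itself; the record
# propagates up to "apps.food.utils", which owns the console and
# file handlers configured in LOGGING above.
logger.info("parsed %d menus", 5)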