optimize parsing

This commit is contained in:
vincent 2019-05-09 19:12:41 +02:00
parent db3001be7e
commit 2991eb2f59
2 changed files with 50 additions and 38 deletions

View File

@ -1,11 +1,13 @@
from flask import Blueprint, jsonify, request,make_response,redirect,url_for,render_template,current_app from flask import Blueprint, jsonify, request,make_response,redirect,url_for,render_template,current_app
from .Jsonfile import JSONfile from .Jsonfile import JSONfile
from . import emission from .emission import Emmission
import jwt import jwt
from functools import wraps from functools import wraps
from datetime import datetime, timedelta from datetime import datetime, timedelta
from .user import User from .user import User
data= JSONfile("chaine.json") data= JSONfile("chaine.json")
emmission= Emmission()
def token_required(f): def token_required(f):
@wraps(f) @wraps(f)
@ -68,7 +70,8 @@ def get_emmission(num):
if (chaine == "numero de chaine inconnue"): if (chaine == "numero de chaine inconnue"):
return make_response("",204) return make_response("",204)
else: else:
return jsonify(emission.parse_emmission(chaine))
return jsonify(emmission.parse_emmission(chaine))
#@api.route('/register/', methods=('POST',)) #@api.route('/register/', methods=('POST',))
#def register(): #def register():

View File

@ -1,46 +1,55 @@
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import urllib.request import urllib.request
import re import re
from datetime import datetime, timedelta
#debug #debug
#import pprint #import pprint
class Emmission(object):
    """Scrape programme-tv.net for the programme currently airing on a channel.

    The channel reference page (the grid of all channels) is downloaded once
    and cached on the instance; it is re-downloaded when the cache is older
    than ``_CACHE_TTL``, so repeated lookups avoid one HTTP round-trip.
    """

    # How long the cached reference page stays valid before being re-fetched.
    _CACHE_TTL = timedelta(minutes=5)

    def __init__(self):
        # Eagerly warm the cache; on network failure the attributes stay
        # unset and parse_emmission() will retry the download.
        self._LoadreferencePage()

    def _LoadreferencePage(self):
        """Download and parse the channel reference page, stamping the fetch time.

        On network failure the previous cached page (if any) is kept and
        ``None`` is returned.
        """
        URL = "https://www.programme-tv.net/programme/canal-5/"
        try:
            response = urllib.request.urlopen(URL)
        except urllib.error.URLError:
            # Best-effort: keep serving the stale cache rather than crash.
            return None
        print("load")
        self.html = BeautifulSoup(response.read(), "html.parser")
        self.timestamp = datetime.utcnow()

    def parse_emmission(self, strsearch):
        """Return programme info for the channel named *strsearch*.

        Returns a dict with keys 'title', 'href', 'casting', 'synopsis',
        'img', or the string "can't find channel" when the channel name is
        not present on the reference page.
        """
        # BUG FIX: the original test was
        #     self.timestamp > self.timestamp + timedelta(minutes=5)
        # which is always False, so the cached page was never refreshed.
        # Also retry the download if the initial fetch failed in __init__
        # (self.html/self.timestamp would not exist yet).
        if (not hasattr(self, "html")
                or datetime.utcnow() > self.timestamp + self._CACHE_TTL):
            self._LoadreferencePage()
        if not hasattr(self, "html"):
            # Reference page still unavailable (network down since startup).
            return "can't find channel"

        # The site spells channel names without the accented E.
        strsearch = strsearch.replace('É', 'E')
        linkchaine = self.html.find(text=re.compile(re.escape(strsearch)))
        if linkchaine is None:
            return "can't find channel"

        # The sibling of the channel-name node holds the current programme link.
        link = linkchaine.parent.parent.find_next_sibling().find("a")
        href = link['href']

        # Fetch the programme detail page (not cached: changes per programme).
        response = urllib.request.urlopen(href)
        parse = BeautifulSoup(response.read(), "html.parser")

        divcasting = parse.select_one(".descriptif")
        if divcasting:
            # Actor names are the anchors whose href points at a biography page.
            casting = [actor.text
                       for actor in divcasting.find_all(href=re.compile("biographie"))]
        else:
            casting = None

        divsynopsis = parse.select_one(".episode-synopsis")
        if divsynopsis:
            img = divsynopsis.find_next('img')['data-src']
            synopsis = divsynopsis.select_one(".d-b").text
        else:
            img = None
            synopsis = ""

        return {'title': link['title'], 'href': href, 'casting': casting,
                'synopsis': remove_first_space(synopsis), 'img': img}
def remove_first_space (string): def remove_first_space (string):