optimize parsing

This commit is contained in:
vincent 2019-05-09 19:12:41 +02:00
parent db3001be7e
commit 2991eb2f59
2 changed files with 50 additions and 38 deletions

View File

@ -1,11 +1,13 @@
from flask import Blueprint, jsonify, request,make_response,redirect,url_for,render_template,current_app
from .Jsonfile import JSONfile
from . import emission
from .emission import Emmission
import jwt
from functools import wraps
from datetime import datetime, timedelta
from .user import User
# Module-level object built from "chaine.json" (JSONfile is a project-local
# helper; presumably this holds the channel table -- confirm in Jsonfile).
data= JSONfile("chaine.json")
# Single scraper instance created at import time and used by the routes.
# NOTE(review): the name is `emmission` (double "m"), distinct from the
# `emission` module imported above.
emmission= Emmission()
def token_required(f):
@wraps(f)
@ -68,7 +70,8 @@ def get_emmission(num):
if (chaine == "numero de chaine inconnue"):
return make_response("",204)
else:
return jsonify(emission.parse_emmission(chaine))
return jsonify(emmission.parse_emmission(chaine))
#@api.route('/register/', methods=('POST',))
#def register():

View File

@ -1,46 +1,55 @@
from bs4 import BeautifulSoup
import urllib.request
import re
from datetime import datetime, timedelta
#debug
#import pprint
def parse_emmission(strsearch):
    """Scrape the programme linked to a channel on programme-tv.net.

    The listing page is downloaded, the first text node matching
    ``strsearch`` is located, and the programme page linked next to it is
    downloaded and parsed for its cast, synopsis and image.

    Args:
        strsearch: Channel name to look for in the listing page. ``É`` is
            replaced by ``E`` before searching.

    Returns:
        ``False`` when the listing page cannot be downloaded, the string
        ``"can't find channel"`` when no text node matches, otherwise a dict
        with the keys ``title``, ``href``, ``casting`` (list of names or
        ``None``), ``synopsis`` and ``img`` (URL or ``None``).

    Raises:
        urllib.error.URLError: If the programme page itself cannot be
            downloaded (only the listing download is guarded).
    """
    URL = "https://www.programme-tv.net/programme/canal-5/"
    try:
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(URL) as response:
            html = response.read()
    except urllib.error.URLError:
        return False
    parse = BeautifulSoup(html, "html.parser")

    strsearch = strsearch.replace('É', 'E')
    # The name is escaped so it is matched literally, not as a regex.
    linkchaine = parse.find(text=re.compile(re.escape(strsearch)))
    if linkchaine is None:
        return "can't find channel"

    # The programme link sits in the element following the grandparent of
    # the matched channel-name text.
    link = linkchaine.parent.parent.find_next_sibling().find("a")
    href = link['href']
    with urllib.request.urlopen(href) as response:
        parse = BeautifulSoup(response.read(), "html.parser")

    divcasting = parse.select_one(".descriptif")
    if divcasting:
        # Keep only the display text of every link pointing to a biography.
        casting = [
            actor.text
            for actor in divcasting.find_all(href=re.compile("biographie"))
        ]
    else:
        casting = None

    divsynopsis = parse.select_one(".episode-synopsis")
    if divsynopsis:
        img = divsynopsis.find_next('img')['data-src']
        synopsis = divsynopsis.select_one(".d-b").text
    else:
        img = None
        synopsis = ""

    return {
        'title': link['title'],
        'href': href,
        'casting': casting,
        'synopsis': remove_first_space(synopsis),
        'img': img,
    }
class Emmission(object):
    """Scraper for programme-tv.net that caches the channel listing page.

    The listing page is downloaded once, parsed and kept in ``self.html``;
    it is downloaded again when the cached copy is older than ``CACHE_TTL``.

    Attributes are ``None`` until a download succeeds:
        html: Parsed listing page.
        timestamp: Naive UTC datetime of the last successful download.
    """

    # Listing page from which each channel's programme link is read.
    REFERENCE_URL = "https://www.programme-tv.net/programme/canal-5/"
    # Maximum age of the cached listing page before it is reloaded.
    CACHE_TTL = timedelta(minutes=5)

    def __init__(self):
        # Defined up front so a failed first download leaves a usable object
        # instead of one with missing attributes.
        self.html = None
        self.timestamp = None
        self._LoadreferencePage()

    def _LoadreferencePage(self):
        """Download and parse the listing page, updating the cache.

        On a download error the previous cache (if any) is left untouched
        and ``None`` is returned.
        """
        try:
            # The context manager closes the HTTP response after reading.
            with urllib.request.urlopen(self.REFERENCE_URL) as response:
                content = response.read()
        except urllib.error.URLError:
            return None
        print("load")
        self.html = BeautifulSoup(content, "html.parser")
        self.timestamp = datetime.utcnow()

    def _is_stale(self):
        """Return True when there is no cached page or it exceeded the TTL."""
        if self.timestamp is None:
            return True
        # Compare against the current time: comparing the timestamp with
        # itself plus the TTL would never be true.
        return datetime.utcnow() > self.timestamp + self.CACHE_TTL

    def parse_emmission(self, strsearch):
        """Scrape the programme linked to a channel in the cached listing.

        Args:
            strsearch: Channel name to look for in the listing page. ``É``
                is replaced by ``E`` before searching.

        Returns:
            ``False`` when no listing page could ever be downloaded, the
            string ``"can't find channel"`` when no text node matches,
            otherwise a dict with the keys ``title``, ``href``, ``casting``
            (list of names or ``None``), ``synopsis`` and ``img`` (URL or
            ``None``).

        Raises:
            urllib.error.URLError: If the programme page itself cannot be
                downloaded.
        """
        if self._is_stale():
            self._LoadreferencePage()
        if self.html is None:
            # Listing page unavailable and nothing cached to fall back on.
            return False

        strsearch = strsearch.replace('É', 'E')
        # The name is escaped so it is matched literally, not as a regex.
        linkchaine = self.html.find(text=re.compile(re.escape(strsearch)))
        if linkchaine is None:
            return "can't find channel"

        # The programme link sits in the element following the grandparent
        # of the matched channel-name text.
        link = linkchaine.parent.parent.find_next_sibling().find("a")
        href = link['href']
        with urllib.request.urlopen(href) as response:
            parse = BeautifulSoup(response.read(), "html.parser")

        divcasting = parse.select_one(".descriptif")
        if divcasting:
            # Keep only the display text of every biography link.
            casting = [
                actor.text
                for actor in divcasting.find_all(href=re.compile("biographie"))
            ]
        else:
            casting = None

        divsynopsis = parse.select_one(".episode-synopsis")
        if divsynopsis:
            img = divsynopsis.find_next('img')['data-src']
            synopsis = divsynopsis.select_one(".d-b").text
        else:
            img = None
            synopsis = ""

        return {
            'title': link['title'],
            'href': href,
            'casting': casting,
            'synopsis': remove_first_space(synopsis),
            'img': img,
        }
def remove_first_space (string):