"""Look up the programme linked next to a channel name on programme-tv.net."""

import logging
import re
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timedelta, timezone

from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


class Emmission(object):
    """Scraper that resolves a channel name to details of its linked programme.

    The reference page is downloaded once and kept for ``CACHE_LIFETIME``;
    after that, the next lookup triggers a new download.

    Attributes set on the instance:
        html: parsed reference page, or ``None`` if it has never been loaded.
        timeexp: timezone-aware UTC instant after which ``html`` is stale.
    """

    # Page searched for the channel name.
    REFERENCE_URL = "https://www.programme-tv.net/programme/canal-5/"
    # How long a downloaded reference page is reused before being refreshed.
    CACHE_LIFETIME = timedelta(minutes=5)

    def __init__(self):
        # Defaults guarantee both attributes exist even when the first
        # download fails; the "already expired" timestamp forces a retry on
        # the next lookup.
        self.html = None
        self.timeexp = datetime.min.replace(tzinfo=timezone.utc)
        self._LoadreferencePage()

    def _LoadreferencePage(self):
        """Download and parse the reference page, refreshing the cache.

        On ``urllib.error.URLError`` the previous ``html``/``timeexp`` values
        are left untouched, so a stale page keeps being served and the
        download is retried on the next lookup.

        Returns:
            None, always.
        """
        try:
            with urllib.request.urlopen(self.REFERENCE_URL) as response:
                markup = response.read()
        except urllib.error.URLError as err:
            logger.warning("could not load %s: %s", self.REFERENCE_URL, err)
            return None
        logger.debug("load")
        self.html = BeautifulSoup(markup, "html.parser")
        self.timeexp = datetime.now(timezone.utc) + self.CACHE_LIFETIME
        return None

    @staticmethod
    def _find_programme_link(text_node):
        """Return the ``<a>`` tag associated with a matched channel text node.

        The link is looked up in the element that follows the grandparent of
        the matched text. Returns ``None`` when any step of that walk is
        missing or the link has no ``href``.
        """
        parent = text_node.parent
        grandparent = parent.parent if parent is not None else None
        if grandparent is None:
            return None
        sibling = grandparent.find_next_sibling()
        if sibling is None:
            return None
        link = sibling.find("a")
        if link is None or not link.get("href"):
            return None
        return link

    def parse_emmission(self, strsearch):
        """Fetch details of the programme linked next to a channel name.

        Args:
            strsearch: channel name to look for in the reference page. It is
                matched literally (regex-escaped) as a substring, after
                replacing ``'É'`` with ``'E'``.

        Returns:
            A dict with keys ``'title'``, ``'href'``, ``'casting'`` (list of
            names, or ``None`` when the page has no ``.descriptif`` section),
            ``'synopsis'`` (leading whitespace removed, ``""`` when absent)
            and ``'img'`` (``None`` when absent); or an error string when the
            reference page is unavailable or the channel cannot be located.

        Raises:
            urllib.error.URLError: if the programme page itself cannot be
                fetched.
        """
        if datetime.now(timezone.utc) > self.timeexp:
            self._LoadreferencePage()
        if self.html is None:
            # The reference page has never been downloaded successfully.
            return "can't load channel list"

        strsearch = strsearch.replace('É', 'E')
        linkchaine = self.html.find(string=re.compile(re.escape(strsearch)))
        if linkchaine is None:
            return "can't find channel"

        link = self._find_programme_link(linkchaine)
        if link is None:
            return "can't find channel"

        # Resolve against the reference URL so relative links also work;
        # absolute links are returned unchanged by urljoin.
        href = urllib.parse.urljoin(self.REFERENCE_URL, link['href'])
        with urllib.request.urlopen(href) as response:
            parse = BeautifulSoup(response.read(), "html.parser")

        divcasting = parse.select_one(".descriptif")
        if divcasting is not None:
            biography_links = divcasting.find_all(href=re.compile("biographie"))
            casting = [actor.text for actor in biography_links]
        else:
            casting = None

        img = None
        synopsis = ""
        divsynopsis = parse.select_one(".episode-synopsis")
        if divsynopsis is not None:
            img_tag = divsynopsis.find_next('img')
            if img_tag is not None:
                img = img_tag.get('data-src')
            synopsis_tag = divsynopsis.select_one(".d-b")
            if synopsis_tag is not None:
                synopsis = synopsis_tag.text

        return {
            'title': link.get('title'),
            'href': href,
            'casting': casting,
            'synopsis': remove_first_space(synopsis),
            'img': img,
        }


def remove_first_space(string):
    """Return *string* with all leading whitespace characters removed."""
    return string.lstrip()