import re
import urllib.error
import urllib.request
from datetime import datetime, timedelta
from time import sleep

from bs4 import BeautifulSoup

#debug
#import pprint


class Emmission(object):
    """Look up the programme listed next to a channel on a reference page.

    The reference page is downloaded on construction and cached; it is
    re-downloaded by ``parse_emmission`` once the cache is older than
    ``_CACHE_TTL``.
    """

    # True while a reload of the reference page is in progress. Callers that
    # see it set wait for it to clear. It is a plain attribute, not a lock.
    loading = False

    _REFERENCE_URL = "https://www.programme-tv.net/programme/canal-5/"
    _CACHE_TTL = timedelta(seconds=30)
    # Value returned by parse_emmission when no programme can be located.
    _NOT_FOUND = "can't find channel"

    def __init__(self):
        # Define the cache attributes up front so that a failed first
        # download leaves the instance usable: ``datetime.min`` makes the
        # next parse_emmission() call retry the download.
        self.html = None
        self.timeexp = datetime.min
        self._LoadreferencePage()

    def _LoadreferencePage(self):
        """Download and parse the reference page, refreshing the cache.

        On ``urllib.error.URLError`` the previous cache (if any) and its
        expiry are left untouched and ``None`` is returned.
        """
        try:
            with urllib.request.urlopen(self._REFERENCE_URL) as response:
                payload = response.read()
        except urllib.error.URLError:
            return None
        print("load")
        self.html = BeautifulSoup(payload, "html.parser")
        # Kept as a naive UTC timestamp so the attribute's type is unchanged.
        self.timeexp = datetime.utcnow() + self._CACHE_TTL

    def _refresh_if_expired(self):
        """Reload the reference page if the cache expired, else wait for a reload."""
        if datetime.utcnow() > self.timeexp and not self.loading:
            self.loading = True
            try:
                self._LoadreferencePage()
            finally:
                # Always clear the flag, otherwise waiting callers would
                # spin forever after an unexpected error.
                self.loading = False
        else:
            while self.loading:
                sleep(0.1)

    def _find_channel(self, strsearch):
        """Return the first text node containing *strsearch*, or ``None``.

        If the name is not found as given, it is retried with all spaces
        removed.
        """
        found = self.html.find(string=re.compile(re.escape(strsearch)))
        if found is None:
            compact = strsearch.replace(" ", "")
            found = self.html.find(string=re.compile(re.escape(compact)))
        return found

    def parse_emmission(self, strsearch):
        """Return details of the programme listed next to a channel name.

        Args:
            strsearch: Channel name to search for in the reference page.
                ``É`` is replaced by ``E`` and surrounding whitespace is
                stripped before searching.

        Returns:
            A dict with the keys ``title``, ``href``, ``casting`` (list of
            names, or ``None`` when the detail page has no ``.peopleList``),
            ``synopsis`` (leading whitespace removed, ``""`` when absent)
            and ``img``; or the string ``"can't find channel"`` when the
            reference page is unavailable or no programme link can be
            located for the channel.

        Raises:
            urllib.error.URLError: If the programme's detail page cannot be
                fetched.
        """
        self._refresh_if_expired()
        if self.html is None:
            # The reference page has never been downloaded successfully.
            return self._NOT_FOUND

        strsearch = strsearch.replace('É', 'E').strip()
        print(strsearch)

        chaineElement = self._find_channel(strsearch)
        if chaineElement is None:
            return self._NOT_FOUND

        # The programme entry is the next sibling of the channel text's
        # great-grandparent element; bail out if the tree is shallower.
        container = chaineElement
        for _ in range(3):
            container = container.parent
            if container is None:
                return self._NOT_FOUND
        emissionElement = container.find_next_sibling()
        if emissionElement is None:
            return self._NOT_FOUND
        print(emissionElement)

        link = emissionElement.find("a")
        if link is None or not link.has_attr("href"):
            return self._NOT_FOUND
        href = link["href"]
        if link.has_attr("title"):
            title = link["title"]
        else:
            # Fall back to the anchor text rather than failing after the
            # detail page has already been fetched.
            title = link.get_text(strip=True)

        # Prefer the "data-src" attribute over "src"; tolerate a missing
        # <img> or missing attributes.
        img_tag = emissionElement.find_next('img')
        if img_tag is None:
            img = None
        else:
            img = img_tag.get('data-src', img_tag.get('src'))

        with urllib.request.urlopen(href) as response:
            parse = BeautifulSoup(response.read(), "html.parser")

        divcasting = parse.select_one(".peopleList")
        if divcasting is not None:
            biography_links = divcasting.find_all(
                href=re.compile(r"\/biographie.*"), title=True
            )
            casting = [actor['title'] for actor in biography_links]
        else:
            casting = None

        divsynopsis = parse.select_one(".synopsis")
        if divsynopsis is not None:
            synopsis = divsynopsis.text
        else:
            # NOTE(review): the image is also discarded when the detail page
            # has no synopsis; kept as in the original - confirm intended.
            img = None
            synopsis = ""

        return {
            'title': title,
            'href': href,
            'casting': casting,
            'synopsis': remove_first_space(synopsis),
            'img': img,
        }


def remove_first_space(string):
    """Return *string* with its leading whitespace characters removed."""
    return string.lstrip()