"""Scrape program information for a TV channel from programme-tv.net."""

import re
from datetime import datetime, timedelta
from time import sleep

import requests
from bs4 import BeautifulSoup

# Listing page that is downloaded and cached by Emmission.
_REFERENCE_URL = "https://www.programme-tv.net/programme/canal-5/"
# How long a downloaded listing page is reused before it is fetched again.
_CACHE_LIFETIME = timedelta(seconds=30)
# Seconds to wait for the remote site; without it requests may block forever.
_REQUEST_TIMEOUT = 15
# Links of the cast list that point to a biography page.
_BIOGRAPHY_HREF = re.compile(r"/biographie.*")


class Emmission(object):
    """Look up the program listed for a channel on the cached listing page.

    The listing page is downloaded when the object is created and downloaded
    again by ``parse_emmission`` once the cached copy has expired.

    Attributes set by the instance:
        html: parsed listing page (``BeautifulSoup``).
        timeexp: naive UTC ``datetime`` after which ``html`` is refreshed.
    """

    # True while the listing page is being re-downloaded; callers arriving in
    # the meantime poll this flag instead of starting a second download.
    loading = False

    def __init__(self):
        self._LoadreferencePage()

    def _LoadreferencePage(self):
        """Download and parse the listing page, then restart the expiry timer."""
        response = requests.get(_REFERENCE_URL, timeout=_REQUEST_TIMEOUT)
        print("load")
        self.html = BeautifulSoup(response.content, "html.parser")
        self.timeexp = datetime.utcnow() + _CACHE_LIFETIME

    def _refresh_if_expired(self):
        """Reload the listing page when stale, or wait for a reload in progress."""
        if datetime.utcnow() > self.timeexp and not self.loading:
            self.loading = True
            try:
                self._LoadreferencePage()
            finally:
                # Always clear the flag: if the download raised and the flag
                # stayed set, every later call would wait in the loop below
                # forever.
                self.loading = False
        else:
            while self.loading:
                sleep(0.1)

    def _find_channel(self, strsearch):
        """Return the text node containing the channel name, or None.

        The name is tried as given first and, if that fails, with all
        spaces removed.
        """
        for candidate in (strsearch, strsearch.replace(" ", "")):
            node = self.html.find(string=re.compile(re.escape(candidate)))
            if node is not None:
                return node
        return None

    def parse_emmission(self, strsearch):
        """Return details of the program listed next to a channel.

        Args:
            strsearch: channel name to look for in the listing page. "É" is
                replaced by "E" and surrounding whitespace is stripped
                before searching.

        Returns:
            The string "can't find channel" when the name is not found in
            the listing page. Otherwise a dict with the keys "title",
            "href", "casting" (list of names, or None when the detail page
            has no ".peopleList" element), "synopsis" (leading whitespace
            removed, "" when absent) and "img" (image URL or None).

        Note:
            The lookup relies on the markup of the site: the program element
            is taken as the next sibling of the channel text's third
            ancestor. If that structure is missing, attribute or item access
            on None raises (AttributeError / TypeError).
        """
        self._refresh_if_expired()

        strsearch = strsearch.replace("É", "E").strip()
        print(strsearch)

        chaineElement = self._find_channel(strsearch)
        if chaineElement is None:
            return "can't find channel"

        emissionElement = chaineElement.parent.parent.parent.find_next_sibling()
        print(emissionElement)
        link = emissionElement.find("a")
        href = link["href"]

        # Prefer the lazy-loading attribute and fall back to the plain source.
        img_tag = emissionElement.find_next("img")
        if img_tag is None:
            img = None
        else:
            img = img_tag.get("data-src", img_tag.get("src"))

        response = requests.get(href, timeout=_REQUEST_TIMEOUT)
        parse = BeautifulSoup(response.content, "html.parser")

        divcasting = parse.select_one(".peopleList")
        if divcasting:
            casting = [
                actor["title"]
                for actor in divcasting.find_all(href=_BIOGRAPHY_HREF)
            ]
        else:
            casting = None

        divsynopsis = parse.select_one(".synopsis")
        if divsynopsis:
            synopsis = divsynopsis.text
        else:
            # NOTE(review): the image is also dropped when no synopsis is
            # found -- confirm this is intentional.
            img = None
            synopsis = ""

        return {
            "title": link["title"],
            "href": href,
            "casting": casting,
            "synopsis": remove_first_space(synopsis),
            "img": img,
        }


def remove_first_space(string):
    """Return *string* without its leading whitespace characters."""
    return string.lstrip()