chainetv/parserHTML.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
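"""HTML scraping helpers for chainetv: build the Canal channel number/name
list from Wikipedia and fetch programme details (title, cast, synopsis,
image) from programme-tv.net."""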
from bs4 import BeautifulSoup
import urllib.request
import json
import re
def RepresentsInt(s):
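    """Return True if s can be converted to an int, False otherwise."""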
    try:
        int(s)
        return True
    except (ValueError, TypeError):
        return False

def parsechaine():
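    """Scrape the Canal channel list from Wikipedia and write it to
    chaine.json as a {channel number: channel name} mapping."""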
    URL = 'https://fr.wikipedia.org/wiki/Liste_des_cha%C3%AEnes_de_Canal'
    liste_chaine = {}
    response = urllib.request.urlopen(URL)
    html = response.read()
    parse = BeautifulSoup(html, "html.parser")
    for item in parse.find_all('table'):
        # Only the channel tables are marked "wikitable" (optionally sortable).
        if item.get("class") in (['wikitable'], ['wikitable', 'sortable']):
            for tr in item.find_all('tr'):
                firstTD = tr.find()
                num = firstTD.string
                if RepresentsInt(num):
                    # If the second cell is also numeric, the channel name is
                    # in the third cell; otherwise it is in the second one.
                    if RepresentsInt(firstTD.find_next().string):
                        liste_chaine[num] = firstTD.find_next().find_next().string
                    else:
                        liste_chaine[num] = firstTD.find_next().string
    print(json.dumps(liste_chaine, indent=4))
    with open('chaine.json', 'w', encoding='utf-8') as f:
        json.dump(liste_chaine, f, indent=4)

def load_jsonfile(file):
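    """Load and return the JSON content of file."""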
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)

def parse_emmission(URL):
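    """Search programme-tv.net, follow the first result and return a dict
    with the programme's title, URL, cast list, synopsis and image URL."""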
    response = urllib.request.urlopen(URL)
    html = response.read()
    parse = BeautifulSoup(html, "html.parser")
    # First result of the search page, linking to the programme page.
    link = parse.select_one(".prog_name")
    response = urllib.request.urlopen("https://www.programme-tv.net" + link['href'])
    html = response.read()
    parse = BeautifulSoup(html, "html.parser")
    # Cast members are the biography links inside the description block.
    divcasting = parse.select_one(".descriptif")
    casting = [actor.text for actor in divcasting.find_all(href=re.compile("biographie"))]
    divsynopsis = parse.select_one(".episode-synopsis")
    img = divsynopsis.find_next('img')['data-src']
    synopsis = divsynopsis.select_one(".d-b").text
    return {'title': link['title'],
            'href': "https://www.programme-tv.net" + link['href'],
            'casting': casting,
            'synopsis': synopsis,
            'img': img}

if __name__ == '__main__':
    print(parse_emmission("https://www.programme-tv.net/rechercher?q=France+3"))
    # parsechaine()
    # data = load_jsonfile('chaine.json')
    # print(data["0"])