chainetv/parserHTML.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
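"""HTML scraping helpers for chainetv: build the Canal channel number/name
list from Wikipedia and fetch programme details (title, cast, synopsis,
image) from programme-tv.net."""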
from bs4 import BeautifulSoup
import urllib.request
import json
import re
def RepresentsInt(s):
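    """Return True if s can be converted to an int, False otherwise."""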
    try:
        int(s)
        return True
    except (ValueError, TypeError):
        return False

def parsechaine():
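    """Scrape the Canal channel list from Wikipedia and write it to
    chaine.json as a {channel number: channel name} mapping."""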
    URL = 'https://fr.wikipedia.org/wiki/Liste_des_cha%C3%AEnes_de_Canal'
    liste_chaine = {}
    response = urllib.request.urlopen(URL)
    html = response.read()
    parse = BeautifulSoup(html, "html.parser")
    for item in parse.find_all('table'):
        # Only the channel tables are marked "wikitable" (optionally sortable).
        if item.get("class") in (['wikitable'], ['wikitable', 'sortable']):
            for tr in item.find_all('tr'):
                firstTD = tr.find()
                num = firstTD.string
                if RepresentsInt(num):
                    # If the second cell is also numeric, the channel name is
                    # in the third cell; otherwise it is in the second one.
                    if RepresentsInt(firstTD.find_next().string):
                        liste_chaine[num] = firstTD.find_next().find_next().string
                    else:
                        liste_chaine[num] = firstTD.find_next().string
    print(json.dumps(liste_chaine, indent=4))
    with open('chaine.json', 'w', encoding='utf-8') as f:
        json.dump(liste_chaine, f, indent=4)

def load_jsonfile(file):
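    """Load and return the JSON content of file."""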
    with open(file, 'r', encoding='utf-8') as f:
        return json.load(f)

def parse_emmission(URL):
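    """Search programme-tv.net, follow the first result and return a dict
    with the programme's title, URL, cast list, synopsis and image URL."""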
    response = urllib.request.urlopen(URL)
    html = response.read()
    parse = BeautifulSoup(html, "html.parser")
    # First result of the search page, linking to the programme page.
    link = parse.select_one(".prog_name")
    response = urllib.request.urlopen("https://www.programme-tv.net" + link['href'])
    html = response.read()
    parse = BeautifulSoup(html, "html.parser")
    # Cast members are the biography links inside the description block.
    divcasting = parse.select_one(".descriptif")
    casting = [actor.text for actor in divcasting.find_all(href=re.compile("biographie"))]
    divsynopsis = parse.select_one(".episode-synopsis")
    img = divsynopsis.find_next('img')['data-src']
    synopsis = divsynopsis.select_one(".d-b").text
    return {'title': link['title'],
            'href': "https://www.programme-tv.net" + link['href'],
            'casting': casting,
            'synopsis': synopsis,
            'img': img}

if __name__ == '__main__':
    print(parse_emmission("https://www.programme-tv.net/rechercher?q=France+3"))
    # parsechaine()
    # data = load_jsonfile('chaine.json')
    # print(data["0"])