# chainetv/parserHTML.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import urllib.request
import json

def RepresentsInt(s):
    """Tell whether ``int(s)`` succeeds.

    Returns True when ``s`` can be converted with ``int``; returns False
    when the conversion raises ``ValueError`` or ``TypeError`` (e.g. for
    non-numeric text or ``None``).
    """
    try:
        int(s)
    except (ValueError, TypeError):
        return False
    else:
        return True
def parsechaine():
    """Scrape the Canal channel list from French Wikipedia into chaine.json.

    Downloads the page, walks every table whose class list is exactly
    ``['wikitable']`` or ``['wikitable', 'sortable']``, and for each row
    whose first tag holds an integer records ``number -> name``. The name
    is taken from the tag following the first one, or from the tag after
    that when the following tag also holds an integer. The resulting
    mapping is printed as indented JSON and written to ``chaine.json``
    (relative to the current working directory).

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
        OSError: if ``chaine.json`` cannot be written.
    """
    URL = 'https://fr.wikipedia.org/wiki/Liste_des_cha%C3%AEnes_de_Canal'
    liste_chaine = {}
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(URL) as response:
        html = response.read()
    parse = BeautifulSoup(html, "html.parser")
    accepted_classes = (['wikitable'], ['wikitable', 'sortable'])
    for table in parse.find_all('table'):
        if table.get("class") not in accepted_classes:
            continue
        for tr in table.find_all('tr'):
            first_cell = tr.find()
            if first_cell is None:
                # Row without any child tag: nothing to read.
                continue
            num = first_cell.string
            if not RepresentsInt(num):
                continue
            name_cell = first_cell.find_next()
            if name_cell is not None and RepresentsInt(name_cell.string):
                # The next tag is numeric as well; the name is one tag further.
                name_cell = name_cell.find_next()
            if name_cell is None:
                # No following tag in the document to take a name from.
                continue
            name = name_cell.string
            # Store plain ``str`` copies: NavigableString objects keep a
            # reference to the whole parse tree.
            liste_chaine[str(num)] = None if name is None else str(name)
    print(json.dumps(liste_chaine, indent=4))
    with open('chaine.json', 'w', encoding='utf-8') as f:
        json.dump(liste_chaine, f, indent=4)

def load_jsonfile(file):
    """Read the UTF-8 encoded JSON document at ``file`` and return it decoded.

    Args:
        file: Path of the JSON file to open for reading.

    Returns:
        The Python object produced by decoding the file's JSON content.
    """
    with open(file, mode='r', encoding='utf-8') as handle:
        content = handle.read()
    return json.loads(content)

# Script body. NOTE(review): there is no ``__main__`` guard, so these
# statements also run whenever this module is imported.
# Fetch the page and (re)generate chaine.json in the working directory.
parsechaine()
# Reload the freshly written mapping from disk.
data=load_jsonfile('chaine.json')
# Print the entry stored under key "0"; raises KeyError if that key is absent.
print(data["0"])