Cours et astuces en programmation et conception de sites Internet

Code en vrac

Data scraping: les Unes de sites d'info en direct

Avec Python et la bibliothèque Beautiful Soup, gratter le web est devenu un jeu d'enfant!

Ce script se connecte à plusieurs sites d'information, récupère sur chacun le titre qui fait la Une, puis l'affiche dans le terminal depuis lequel le programme a été lancé.


# -*- coding: utf8 -*-
import time
import urllib2
from bs4 import BeautifulSoup
import re

class CrawlerTitreUne:
    """Fetch the front page of a news site and extract its headline.

    ``media`` selects the extraction strategy: ``'liberation'``, ``'post'``
    and ``'figaro'`` have dedicated rules; any other value uses the generic
    rule driven by ``delimiters``.

    ``delimiters`` is a sequence ``[tag, attribute, value]``. The generic
    rule looks for the first ``tag`` whose ``attribute`` equals ``value``;
    the ``'liberation'`` rule only uses ``tag``.
    """

    # Marker that precedes the headline link in the raw Figaro HTML.
    # NOTE(review): the previous literal contained stray ';' characters
    # (';<h2 ...";>;<a'), which looks like mangled '&lt;' / '&gt;' entities
    # and could never match real markup -- confirm against the live page.
    _FIGARO_MARKER = '<h2 class="fig-profil-headline"><a title='

    def __init__(self, media, url, delimiters):
        self.media = media
        self.url = url
        self.delimiters = delimiters

    def content(self):
        """Download ``self.url`` and return the page, or None on failure.

        Returns the raw response body for ``'figaro'`` (parsed by hand in
        ``titre``) and a ``BeautifulSoup`` document for every other media.
        Failures are reported on stdout and never propagate (best effort).
        """
        try:
            response = urllib2.urlopen(urllib2.Request(self.url))
            try:
                raw = response.read()
            finally:
                # Always release the connection, even if read() fails.
                response.close()
            # Exact comparison: `media in ('figaro')` was a substring test
            # on a plain string, so e.g. 'fig' or '' also matched.
            if self.media == 'figaro':
                return raw
            return BeautifulSoup(raw)
        except Exception:
            # Still best effort, but no longer swallows KeyboardInterrupt
            # or SystemExit the way a bare `except:` did.
            print(":::  ", self.url)
            return None

    def __repr__(self, titre=None):
        """Return a debugging representation of the crawler.

        ``titre`` is kept for backward compatibility: when given, the
        timestamp, media name and headline are printed as before. Without
        it the method now works with the built-in ``repr()``.
        """
        if titre is not None:
            print(time.strftime("%c"), self.media , '\n'+titre)
            print('\n\n')
        return '%s(media=%r, url=%r, delimiters=%r)' % (
            self.__class__.__name__, self.media, self.url, self.delimiters)

    def titre(self):
        """Return ``[link, headline]`` for the front page.

        Returns None when the page cannot be fetched or when the expected
        markup is not found. The page is downloaded only once per call.
        """
        page = self.content()
        if not page:
            return None
        if self.media == 'liberation':
            return self._titre_liberation(page)
        if self.media == 'post':
            return self._titre_post(page)
        if self.media == 'figaro':
            return self._titre_figaro(page)
        return self._titre_generique(page)

    def pending(self):
        """Return the current headline; thin alias of ``titre``."""
        return self.titre()

    @staticmethod
    def _lien_et_texte(bloc):
        """Return (href, one-line text) of the first <a> in ``bloc``, or None."""
        if bloc is None:
            return None
        ancre = bloc.find('a')
        if ancre is None:
            return None
        return ancre.get('href'), ancre.text.strip().replace('\n', '')

    def _titre_liberation(self, page):
        """Headline is the first link inside the first ``delimiters[0]`` tag."""
        trouve = self._lien_et_texte(page.find(self.delimiters[0]))
        if trouve is None:
            return None
        return [trouve[0], trouve[1]]

    def _titre_post(self, page):
        """Headline is the text of every link inside the second <h1>."""
        entetes = page.findAll('h1')
        if len(entetes) < 2:
            return None
        liens = entetes[1].findAll('a')
        if not liens:
            return None
        tampon = ''.join(
            lien.text.replace('\n', '').replace('\t', '').replace('\r', '')
            for lien in liens if lien.text)
        return [liens[0].get('href'), tampon]

    def _titre_figaro(self, html):
        """Scan the raw HTML for the headline link that follows the marker."""
        debut = html.find(self._FIGARO_MARKER)
        if debut == -1:
            return None
        attribut = 'href="'
        debut_lien = html.find(attribut, debut)
        if debut_lien == -1:
            return None
        debut_lien += len(attribut)
        fin_lien = html.find('"', debut_lien)
        if fin_lien == -1:
            return None
        # Skip the closing quote and the '>' that ends the <a ...> tag.
        debut_titre = fin_lien + 2
        # The text runs up to the next tag; the old loop compared a single
        # character with ';<' and therefore could only end in IndexError.
        fin_titre = html.find('<', debut_titre)
        if fin_titre == -1:
            return None
        return [html[debut_lien:fin_lien], html[debut_titre:fin_titre]]

    def _titre_generique(self, page):
        """Headline is the first link inside the tag described by ``delimiters``."""
        bloc = page.find(self.delimiters[0],
                         attrs={self.delimiters[1]: self.delimiters[2]})
        trouve = self._lien_et_texte(bloc)
        if trouve is None:
            return None
        # Strip the trailing counter ("   123") appended after the title.
        return [trouve[0], re.sub(r'   \d+', '', trouve[1])]


# Example run: fetch Le Parisien's front-page headline and print it as
# UTF-8 encoded [link, title].
ti = CrawlerTitreUne('parisien', 'http://leparisien.fr', ['article', 'class', 'article-une clearfix '])
une = ti.pending()
if une:
    print([x.encode('utf8') for x in une])
else:
    # pending() yields None when the page could not be fetched; iterating
    # over it would raise TypeError, so report the miss instead.
    print(":::  aucun titre pour " + ti.url)

On peut ainsi créer un crawler par site et les lancer tous en parallèle, chacun dans son propre thread.