| 
	
	
	
	
		
		| import requests from flask import json
 from requests.exceptions import RequestException
 import re
 from multiprocessing import Pool
 
 '''
 Request+正则表达式抓取猫眼电影
 '''
 
 '''
 获取第一页的内容
 '''
 def getOneContent(url, headers, timeout=10):
     """Fetch ``url`` with HTTP GET and return the response body as text.

     Args:
         url: Address to request.
         headers: Mapping of HTTP request headers passed to ``requests.get``.
         timeout: Seconds to wait for the server before giving up. Without a
             timeout, ``requests.get`` can block indefinitely on a stalled
             connection.

     Returns:
         The response text when the status code is 200. ``None`` for any
         other status code, or when the request raises ``RequestException``.
     """
     try:
         response = requests.get(url, headers=headers, timeout=timeout)
     except RequestException:
         return None
     if response.status_code == 200:
         return response.text
     return None
 
 '''
 解析内容,根据正则表达式
 '''
 def parserContent(content):
     """Extract movie entries from page HTML with a regular expression.

     Args:
         content: HTML text of a board page; may be ``None`` or empty when
             the download failed.

     Returns:
         A list of 7-tuples of strings, one per matched ``<dd>`` element, in
         capture order: board index, ``data-src`` image URL, link text of the
         ``data-val`` anchor, ``star`` paragraph text, ``releasetime``
         paragraph text, ``integer`` score part, ``fraction`` score part.
         The list is empty when ``content`` is falsy or nothing matches.
     """
     if not content:
         # Return an empty list rather than None so callers can always iterate.
         return []

     # Adjacent string literals are concatenated by the parser, so no "+" is
     # needed between the two halves. Raw strings keep "\d" from being read
     # as an (invalid) string escape sequence.
     pattern = re.compile(
         r'<dd.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?</a>.*?<a.*?data-val.*?>(.*?)</a>.*?star.*?>(.*?)</p>.*?releasetime.*?>(.*?)</p>'
         r'.*?integer.*?>(.*?)</i>.*?fraction.*?>(\d+)</i>.*?</dd>',
         re.S,  # make "." match newlines as well
     )
     return pattern.findall(content)
 
 def processData(results):
     """Convert raw regex tuples into movie dictionaries, one at a time.

     Args:
         results: Iterable of 7-item sequences of strings (the shape
             ``parserContent`` returns), or ``None``.

     Yields:
         A dict with the keys ``index``, ``imgurl``, ``name``, ``star``,
         ``releasetime`` and ``score`` for every input item. Nothing is
         yielded when ``results`` is ``None`` or empty.
     """
     if results is None:
         # A failed download can surface here as None; treat it as "no rows".
         return
     for result in results:
         yield {
             'index': result[0],
             'imgurl': result[1],
             'name': result[2],
             # Drop the first 3 characters of the trimmed text, presumably
             # the "主演:" label - confirm against the live page.
             'star': result[3].strip()[3:],
             # Drop the first 5 characters of the trimmed text, presumably
             # the "上映时间:" label - confirm against the live page.
             'releasetime': result[4].strip()[5:],
             # Integer part and fractional part are joined as strings.
             'score': result[5] + result[6],
         }
 
 def storeData(data):
     """Append ``data`` to ``mmovie.txt`` as one JSON document per line.

     Non-ASCII characters are written as-is (``ensure_ascii=False`` together
     with a UTF-8 encoded file) instead of being escaped.

     :param data: the object to serialise and append to the text file
     :return: None
     """
     # The with-statement closes the file on exit; no explicit close() needed.
     with open("mmovie.txt", 'a', encoding='utf-8') as f:
         f.write(json.dumps(data, ensure_ascii=False) + '\n')
 
 def main(offset):
     """Download one board page and append every parsed movie to the output.

     Args:
         offset: Value placed in the ``offset`` query parameter of the board
             URL.
     """
     url = 'http://maoyan.com/board/4?offset=' + str(offset)
     headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36'}
     html = getOneContent(url, headers=headers)
     results = parserContent(html)
     if not results:
         # Nothing to store when the download failed or no entry matched;
         # this also avoids iterating over a None result.
         return
     for item in processData(results):
         storeData(item)
 
 if __name__ == '__main__':
     # Run main() for the ten offsets 0, 10, ..., 90 in worker processes.
     # map() blocks until every call has finished; leaving the with-block
     # then shuts the pool down instead of leaking its worker processes.
     with Pool() as pool:
         pool.map(main, range(0, 100, 10))
 
 首发:传智播客人工智能+python培训学院
 作者:http://python.itcast.cn/
 |  |