```python
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: Aiker
@file: toutiao.py
@time: 9:35 PM
"""
import json
import os
import re
from hashlib import md5
from json import JSONDecodeError
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from requests.exceptions import RequestException

MONGO_URL = 'localhost:27017'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'
GROUP_START = 1
GROUP_END = 20
KEYWORD = '街拍'  # search keyword ("street snap")

client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}


def get_url(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Request failed', url)
        return None


def get_page_index(offset, keyword):
    data = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        'from': 'search_tab',
        'pd': 'synthesis',
        'timestamp': '1124216535987'
    }
    # Turn the parameter dict into a query string appended to the search API URL.
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(data)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Failed to request the index page')
        return None


def parse_page_index(html):
    try:
        data = json.loads(html)  # parse the response into a JSON object
        if data and 'data' in data.keys():
            # print(data.keys())  # debug: print all keys
            for item in data.get('data'):
                if 'article_url' in item:  # skip entries without an article URL to avoid None
                    yield item.get('article_url')  # generator of article URLs
    except JSONDecodeError:
        pass
    except TypeError:
        pass


def get_page_detail(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Failed to request the detail page', url)
        return None


def parse_page_detail(html, url):
    # Regular article pages: title and content are embedded in the articleInfo block.
    pattern = re.compile(r"articleInfo:.*?title:\s'(.*?)',.*?content:\s'(.*?)'.*?groupId", re.S)
    result = re.findall(pattern, html)
    if result:
        title, content = result[0]
        pattern = re.compile(r'"(http://.*?)"', re.S)
        images = re.findall(pattern, content)
        for image in images:
            download_image(image, title)
        return {
            'title': title,
            'url': url,
            'images': images
        }
    else:
        # Gallery pages: the image list is stored as JSON in BASE_DATA.galleryInfo.
        pattern = re.compile(r'BASE_DATA.galleryInfo.*?title:\s\'(.*?)\'.*?gallery: JSON.parse\("(.*)"\)', re.S)
        result = re.findall(pattern, html)
        if result:
            title, content = result[0]
            data = json.loads(content.replace('\\', ''))
            if data and 'sub_images' in data.keys():
                sub_images = data.get('sub_images')
                images = [item.get('url') for item in sub_images]
                for image in images:
                    download_image(image, title)
                return {
                    'title': title,
                    'url': url,
                    'images': images
                }


def save_to_mongo(result):
    # insert_one() replaces the insert() method deprecated in newer pymongo versions.
    if db[MONGO_TABLE].insert_one(result):
        print('Saved to MongoDB', result)
        return True
    return False


def download_image(url, title):
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content, title)
        return None
    except RequestException:
        print('Failed to request the image', url)
        return None


def save_image(content, title):
    try:
        if title:
            # Strip characters that are invalid in directory names.
            title = re.sub('[:?!!:?]', '', title)
        dir = 'z:\\toutiao\\'
        if not os.path.exists(dir + title):
            os.mkdir(dir + title)
        file_path = '{0}/{1}.{2}'.format(dir + title, md5(content).hexdigest(), 'jpg')
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(content)
    except OSError:
        pass


def main(offset):
    html = get_page_index(offset, KEYWORD)
    for url in parse_page_index(html):
        print(url)
        html = get_page_detail(url)
        if html:
            result = parse_page_detail(html, url)
            if result:
                save_to_mongo(result)


if __name__ == '__main__':
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()
```
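Each detail page that yields results is written to the `toutiao` collection as a document with `title`, `url`, and `images` fields. The snippet below is a minimal sketch for spot-checking those documents after a run; it assumes the crawler above has already been executed against the local MongoDB instance at `localhost:27017`.

```python
# Minimal sketch: inspect a few documents stored by the crawler.
# Assumes MongoDB is running locally and the crawler has already populated
# the 'toutiao' database/collection used above.
import pymongo

client = pymongo.MongoClient('localhost:27017')
db = client['toutiao']
for doc in db['toutiao'].find().limit(5):
    print(doc['title'], '-', len(doc['images']), 'images -', doc['url'])
```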
Reposted from: https://blog.51cto.com/m51cto/2374411