博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
Beautiful Soup多线程爬取斗鱼所有主播信息(改良版)
阅读量:4683 次
发布时间:2019-06-09

本文共 4131 字,大约阅读时间需要 13 分钟。

花点时间改良了一下代码。如下

import requests
from bs4 import BeautifulSoup
import pymongo
import lxml
import time, datetime


class douyu_host_info():
    """Scrape streamer ("host") listings from douyu.com category pages and
    store them in a local MongoDB database named 'douyu'.

    Workflow: get_url_list() builds the full category URLs, then
    get_host_info(url) fetches one category page, parses every streamer
    entry and inserts the popular ones into a timestamped collection.
    """

    def __init__(self):
        self.url_host = 'https://www.douyu.com'
        # Timestamp taken once at construction; used both in stored records
        # and in the per-run collection name (host_info_<timestamp>).
        self.date_time = datetime.datetime.now().strftime('%Y%m%d_%H%M')
        self.url_list = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        }
        # Category URL suffixes. They are fixed, so they were captured once
        # on the first crawl and hard-coded here instead of re-scraped.
        self.categorys_list = [
            '/g_LOL', '/g_blzy', '/g_DOTA2', '/g_qipai', '/g_DNF', '/g_CF', '/g_mszb', '/g_CSGO', '/g_How', '/g_DOTA',
            '/g_WOW', '/g_nsh', '/g_Overwatch', '/g_wxy', '/directory/category/PCgame', '/g_jdqs', '/g_TVgame',
            '/g_gwlrsj', '/g_FTG', '/g_xyqx', '/g_NBA2K', '/g_BF', '/g_DG', '/directory/category/djry', '/g_wzry',
            '/g_jdqscjzc', '/g_jdqsqjcj', '/g_qqfcsy', '/g_hyrz', '/g_xyzx', '/g_HLMJ', '/g_phone', '/g_LRSZQ',
            '/g_mhmnz', '/g_CFSY', '/directory/category/syxx', '/g_yz', '/g_xingyu', '/g_ecy', '/g_yqk', '/g_HW',
            '/g_ms', '/g_music', '/g_ip', '/directory/category/yl', '/g_smkj', '/g_yj', '/g_Finance', '/g_kepu',
            '/g_js', '/g_car', '/g_jlp', '/g_tkx', '/directory/sport/cate', '/g_FM233', '/g_yydt', '/g_lianmaihudong',
            '/g_qinggan', '/directory/category/voice', '/g_znl'
        ]

    def Mongodb_set(self, sheet_name, r_data):
        """Insert one document ``r_data`` into collection ``sheet_name`` of
        the local 'douyu' database (MongoDB at localhost:27017)."""
        client = pymongo.MongoClient('localhost', 27017)
        douyu = client['douyu']
        sheet_name = douyu[sheet_name]
        print(r_data)
        sheet_name.insert_one(r_data)

    def get_url_list(self):
        """Build the full category URLs, persist each one to the 'url_list'
        collection, and return the list."""
        for category in self.categorys_list:
            category_url = self.url_host + category
            self.url_list.append(category_url)
            self.Mongodb_set(sheet_name='url_list', r_data={'url': category_url})
        return self.url_list

    def get_host_info(self, url):
        """Fetch one category page and store every streamer whose popularity
        (in units of 10,000 viewers) exceeds 2.

        The request is retried up to 3 times on network errors; if all
        attempts fail the page is skipped instead of crashing the worker.
        """
        time.sleep(0.2)  # small delay to be gentle on the server
        print('Now start open \n{}'.format(url))
        wb_data = None
        for i in range(3):
            try:
                wb_data = requests.get(url, headers=self.headers)
                break
            # Catch only network-level failures (bare except would also
            # swallow KeyboardInterrupt etc.).
            except requests.RequestException:
                print('net work error! will retry 3 times')
        if wb_data is None:
            # All retries failed; the original code would hit a NameError
            # on wb_data here. Skip this category instead.
            print('giving up on {} after 3 retries'.format(url))
            return
        soup = BeautifulSoup(wb_data.text, 'lxml')
        print('start analazy url')
        try:
            category = soup.select('h1')[0].get_text()
        except IndexError:
            # Page has no <h1>; fall back to a placeholder category name.
            category = '未定義類別'
        names = soup.select('.ellipsis.fl')
        nums = soup.select('.dy-num.fr')
        titles = soup.select('.mes h3')
        hrefs = soup.select('#live-list-contentbox  li  a')
        for name, num, href, title in zip(names, nums, hrefs, titles):
            data = {
                '類別': category,
                '主播': name.get_text(),
                '标题': title.get_text().split('\n')[-1].strip(),
                '链接': self.url_host + href.get('href'),
                # Normalize popularity to units of 万 (10,000): "3.5万" -> 3.5,
                # a raw count like "8000" -> 0.8.
                '人氣指數': float(num.get_text()[:-1]) if '万' in num.get_text() else float(num.get_text()) / 10000,
                '當前時間': self.date_time
            }
            # Only keep streamers with more than 20,000 viewers.
            if data['人氣指數'] > 2:
                self.Mongodb_set(sheet_name='host_info_{}'.format(self.date_time), r_data=data)

    def db_check(self, sheetname, key_word):
        """Print every document in collection ``sheetname`` that matches the
        MongoDB query dict ``key_word``."""
        client = pymongo.MongoClient('localhost', 27017)
        douyu = client['douyu']
        sheetname = douyu[sheetname]
        for data in sheetname.find(key_word):
            print(data)

  

from multiprocessing import Poolfrom douyu_host_2 import douyu_host_infodouyu = douyu_host_info()def data_check():    #{u'當前時間':'20180901 10:58', u'人氣指數':{'$gte':30}}    #{'主播':'    # sheetname = input('Which sheet do you want to check')    sheetname = 'host_info_20180901_1530'    # key_word = input('Do you want to check with?')    key_word = {'類別': 'DOTA2'}    douyu.db_check(sheetname=sheetname, key_word=key_word)def w_to_db():    pool = Pool()    url_list = douyu.get_url_list()    pool.map(douyu.get_host_info, url_list)if __name__ == '__main__':    w_to_db()    data_check()

  这个爬虫没有包含翻页,只爬取了每个类别下面的首页,翻页爬所有主播请参考这个脚本。

 

 

  https://www.cnblogs.com/lkd8477604/p/9848958.html

转载于:https://www.cnblogs.com/lkd8477604/p/9570649.html

你可能感兴趣的文章
Factory 模式
查看>>
java集合
查看>>
数据结构之停车场
查看>>
Service 回顾
查看>>
单行、多行文本溢出
查看>>
Oracle11gR2 静默建库,删库和配置
查看>>
神经网络反向传播-2-代码
查看>>
CSS 框模型概述
查看>>
38.leetcode12_integer_to_roman
查看>>
VS2010建立MFC应用程序资源视图和类视图空白的解决方案
查看>>
移动平台开发第四周学习总结
查看>>
python排序
查看>>
20140712 合并有序数组
查看>>
【物联网智能网关-07】PCF8591 AD转换模块I2C通信实现
查看>>
React.js 开发参见问题 Q&A
查看>>
26 超链接导航栏案例
查看>>
bootstrap
查看>>
华为机试:字符串翻转
查看>>
server 打开失败
查看>>
搜查令第一周
查看>>