您现在的位置是：首页 > 文章详情

用xpath、bs4、re爬取B站python数据

日期：2018-08-06　点击：517　收藏

"""Scrape Bilibili search results for "python" with XPath, BeautifulSoup or regex.

Three interchangeable parsers are provided; each takes the HTML of one search
result page and yields one dict per video found on it.
"""
import re
import sys

import requests
from bs4 import BeautifulSoup
from lxml import etree

# Seconds to wait on the search server before abandoning a request.
_REQUEST_TIMEOUT = 10

_SEARCH_URL = (
    'https://search.bilibili.com/all?keyword=python&from_source=banner_search'
    '&spm_id_from=333.334.banner_link.1&page={}'
)

# A Cookie header has to be sent along with the request.
_HEADERS = {
    'Cookie': (
        'LIVE_BUVID=AUTO5015218915110407; sid=4oag5i0u; fts=1521891539; '
        'pgv_pvi=3655305216; '
        'UM_distinctid=16257cdfffd2e4-032750a28294b2-3b60450b-100200-16257cdfffe2a0; '
        'buvid3=7B94813D-1039-4A88-A1EE-9AEFDF54BE05140244infoc; '
        'rpdid=kxsliqpkisdosikxllmww; CURRENT_QUALITY=80; finger=edc6ecda'
    ),
    'Host': 'search.bilibili.com',
    'Referer': 'https://www.bilibili.com/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    ),
}

# The key to the regex approach is anchoring each field reliably; usually the
# class attribute just before the field is a good anchor.
# Compiled once at import time instead of on every call.
_VIDEO_PATTERN = re.compile(
    r'<li.*?info.*?title="(.*?)".*?icon-playtime"></i>(.*?)</span>'
    r'.*?icon-date"></i>(.*?)</span>.*?up-name">(.*?)</a>',
    re.S,
)


def get_page(page):
    """Download one page of search results.

    Args:
        page: Page number substituted into the search URL.

    Returns:
        The response body as text when the server answers with status 200,
        otherwise None (non-200 status, or the request itself failed).
    """
    url = _SEARCH_URL.format(page)
    try:
        response = requests.get(url, headers=_HEADERS, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


def _first(nodes):
    """Return the first XPath result, or '' when the query matched nothing."""
    return nodes[0] if nodes else ''


def _tag_text(tag):
    """Return the text of a BeautifulSoup tag, or '' when the tag is None."""
    return tag.get_text() if tag is not None else ''


def xpath_parse_page(html):
    """Parse a result page with XPath (attribute values are selected with ``@``).

    Args:
        html: HTML text of a search result page; None or '' yields nothing.

    Yields:
        One dict per video with the keys video_image, video_title, video_view,
        video_updateTime and video_up. A field whose node is missing is ''.
    """
    if not html:
        return
    data = etree.HTML(html)
    if data is None:
        # Nothing to query if lxml produced no document tree.
        return
    for item in data.xpath('//*[@class="video-contain clearfix"]/li'):
        yield {
            'video_image': _first(item.xpath('./a/div/div[1]/img/@src')),
            'video_title': _first(item.xpath('./div/div[1]/a/text()')),
            'video_view': _first(item.xpath('./div/div[3]/span[1]/text()')).strip(),
            'video_updateTime': _first(item.xpath('./div/div[3]/span[3]/text()')).strip(),
            'video_up': _first(item.xpath('./div/div[3]/span[4]/a/text()')),
        }


def bs4_parse_page(html):
    """Parse a result page with BeautifulSoup.

    Args:
        html: HTML text of a search result page; None or '' yields nothing.

    Yields:
        One dict per video with the keys video_image, video_title, video_view,
        video_updateTime and video_up. A text field whose tag is missing is '';
        video_image is None when there is no <img> or it has no src attribute.
    """
    if not html:
        return
    soup = BeautifulSoup(html, 'lxml')
    for item in soup.find_all('li', {'class': 'video matrix'}):
        image = item.find('img')
        yield {
            'video_image': image.get('src') if image is not None else None,
            'video_title': _tag_text(item.find('a', {'class': 'title'})),
            'video_view': _tag_text(
                item.find('span', {'class': 'so-icon watch-num'})).strip(),
            'video_updateTime': _tag_text(
                item.find('span', {'class': 'so-icon time'})).strip(),
            # Selecting by class here returns the view count instead,
            # so the uploader is matched on the title attribute.
            'video_up': _tag_text(item.find('span', {'title': 'up主'})),
        }


def re_parse_page(html):
    """Parse a result page with a regular expression.

    Args:
        html: HTML text of a search result page; None or '' yields nothing.

    Yields:
        One dict per match with the keys video_title, video_view,
        video_updateTime and video_up (this parser extracts no image URL).
    """
    if not html:
        return
    for title, views, updated, uploader in _VIDEO_PATTERN.findall(html):
        yield {
            'video_title': title,
            'video_view': views.strip(),
            'video_updateTime': updated.strip(),
            'video_up': uploader.strip(),
        }


def main():
    """Fetch each page of results and print every parsed video record."""
    # Pagination: change the range to crawl a different number of pages.
    for page in range(1, 2):
        html = get_page(page)
        if html is None:
            # Skip pages that could not be downloaded instead of crashing
            # inside the parser.
            print('Failed to fetch page {}'.format(page), file=sys.stderr)
            continue
        # Swap the parser here: xpath_parse_page, bs4_parse_page or re_parse_page.
        for result in bs4_parse_page(html):
            print(result)


if __name__ == '__main__':
    main()

原文链接：https://yq.aliyun.com/articles/648019

关注公众号

低调大师中文资讯倾力打造互联网数据资讯、行业资源、电子商务、移动互联网、网络营销平台。

持续更新报道IT业界、互联网、市场资讯、驱动更新，是最及时权威的产业资讯及硬件资讯报道平台。

转载内容版权归作者及来源网站所有，本站原创内容转载请注明来源。