just test
click me
第一章 爬虫入门
请确认可以打开:https://www.python.org/events/python-events/
安装好requests、bs4和lxml(下面的代码使用lxml解析器),然后我们开始实例1:用Requests和Beautiful Soup爬取python.org。
# pip3 install requests bs4 lxml
01_events_with_requests.py
import requests
from bs4 import BeautifulSoup
def get_upcoming_events(url):
    """Print the upcoming events listed on a python.org events page.

    Downloads *url* with requests, parses the HTML with Beautiful Soup and
    prints one dict per event with the keys 'name', 'location' and 'time'.

    Args:
        url: Address of the events listing page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    event_list = soup.find('ul', {'class': 'list-recent-events'})
    for event in event_list.find_all('li'):
        event_details = {
            'name': event.find('h3').find('a').text,
            # attrs has to be a dict (attribute -> value), not a set literal.
            'location': event.find('span', {'class': 'event-location'}).text,
            'time': event.find('time').text,
        }
        print(event_details)


get_upcoming_events('https://www.python.org/events/python-events/')
执行结果:
$ python3 01_events_with_requests.py
{'name': 'PyCon US 2018', 'location': 'Cleveland, Ohio, USA', 'time': '09 May – 18 May 2018'}
{'name': 'DjangoCon Europe 2018', 'location': 'Heidelberg, Germany', 'time': '23 May – 28 May 2018'}
{'name': 'PyCon APAC 2018', 'location': 'NUS School of Computing / COM1, 13 Computing Drive, Singapore 117417, Singapore', 'time': '31 May – 03 June 2018'}
{'name': 'PyCon CZ 2018', 'location': 'Prague, Czech Republic', 'time': '01 June – 04 June 2018'}
{'name': 'PyConTW 2018', 'location': 'Taipei, Taiwan', 'time': '01 June – 03 June 2018'}
{'name': 'PyLondinium', 'location': 'London, UK', 'time': '08 June – 11 June 2018'}
注意:因为事件的内容未必相同,所以每次的结果也不会一样
课后习题: 用requests爬取https://china-testing.github.io/首页的博客标题,共10条。
参考答案:
01_blog_title.py
import requests
from bs4 import BeautifulSoup
def get_upcoming_events(url):
    """Print the title of every blog post found on the page at *url*.

    Downloads *url* with requests, parses the HTML with Beautiful Soup and
    prints one ``{'name': <title>}`` dict per <article> element.

    Args:
        url: Address of the blog index page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    for article in soup.find_all('article'):
        # The post title is the text of the link inside the article's <h1>.
        event_details = {'name': article.find('h1').find('a').text}
        print(event_details)


get_upcoming_events('https://china-testing.github.io/')
执行结果:
$ python3 01_blog_title.py
{'name': '10分钟学会API测试'}
{'name': 'python数据分析快速入门教程4-数据汇聚'}
{'name': 'python数据分析快速入门教程6-重整'}
{'name': 'python数据分析快速入门教程5-处理缺失数据'}
{'name': 'python库介绍-pytesseract: OCR光学字符识别'}
{'name': '软件自动化测试初学者忠告'}
{'name': '使用opencv转换3d图片'}
{'name': 'python opencv3实例(对象识别和增强现实)2-边缘检测和应用图像过滤器'}
{'name': 'numpy学习指南3rd3:常用函数'}
{'name': 'numpy学习指南3rd2:NumPy基础'}
代码:02_events_with_urllib3.py
import urllib3
from bs4 import BeautifulSoup
def get_upcoming_events(url):
    """Print the upcoming events listed on a python.org events page.

    Downloads *url* with urllib3, parses the HTML with Beautiful Soup's
    built-in 'html.parser' and prints one dict per event with the keys
    'name', 'location' and 'time'.

    Args:
        url: Address of the events listing page.
    """
    http = urllib3.PoolManager()
    response = http.request('GET', url)
    # response.data is the raw body; Beautiful Soup is handed it directly.
    soup = BeautifulSoup(response.data, 'html.parser')
    event_list = soup.find('ul', {'class': 'list-recent-events'})
    for event in event_list.find_all('li'):
        event_details = {
            'name': event.find('h3').find('a').text,
            # attrs has to be a dict (attribute -> value), not a set literal.
            'location': event.find('span', {'class': 'event-location'}).text,
            'time': event.find('time').text,
        }
        print(event_details)


get_upcoming_events('https://www.python.org/events/python-events/')
requests对urllib3进行了封装,一般是直接使用requests。
Scrapy是用于提取数据的非常流行的开源Python抓取框架。 Scrapy提供所有这些功能以及许多其他内置模块和扩展。当涉及到使用Python进行挖掘时,它也是我们的首选工具。
Scrapy提供了许多值得一提的强大功能:
使用Scrapy有几种方法。一种是程序模式,即直接在代码中创建爬虫(crawler)和蜘蛛(spider);另一种是先用Scrapy模板或生成器创建项目,然后从命令行运行。本书将采用程序模式,因为这样代码都在单个文件中。
代码:03_events_with_scrapy.py
import scrapy
from scrapy.crawler import CrawlerProcess
class PythonEventsSpider(scrapy.Spider):
    """Spider that collects the events listed on the python.org events page."""

    name = 'pythoneventsspider'
    start_urls = ['https://www.python.org/events/python-events/',]
    # Class-level list: parse() appends every event here so the results can
    # be read from the spider object afterwards.
    found_events = []

    def parse(self, response):
        """Append one dict (name, location, time) per event <li> to found_events."""
        rows = response.xpath('//ul[contains(@class, "list-recent-events")]/li')
        for row in rows:
            self.found_events.append({
                'name': row.xpath('h3[@class="event-title"]/a/text()').extract_first(),
                'location': row.xpath('p/span[@class="event-location"]/text()').extract_first(),
                'time': row.xpath('p/time/text()').extract_first(),
            })
if __name__ == "__main__":
    # Restrict Scrapy's logging to errors so the printed events stay readable.
    process = CrawlerProcess({'LOG_LEVEL': 'ERROR'})
    process.crawl(PythonEventsSpider)
    # Keep a handle on the spider so its found_events can be read afterwards.
    spider = next(iter(process.crawlers)).spider
    process.start()
    for event in spider.found_events:
        print(event)
课后习题: 用scrapy爬取https://china-testing.github.io/首页的博客标题,共10条。
参考答案:
03_blog_with_scrapy.py
import scrapy
from scrapy.crawler import CrawlerProcess
class PythonEventsSpider(scrapy.Spider):
    """Spider that collects the post titles from the blog's front page."""

    name = 'pythoneventsspider'
    start_urls = ['https://china-testing.github.io/',]
    # Class-level list: parse() appends every title here so the results can
    # be read from the spider object afterwards.
    found_events = []

    def parse(self, response):
        """Append one ``{'name': <title>}`` dict per article heading to found_events."""
        headings = response.xpath('//article//h1')
        for heading in headings:
            title = heading.xpath('a/text()').extract_first()
            self.found_events.append({'name': title})
if __name__ == "__main__":
    # Restrict Scrapy's logging to errors so the printed titles stay readable.
    settings = {'LOG_LEVEL': 'ERROR'}
    process = CrawlerProcess(settings)
    process.crawl(PythonEventsSpider)
    # Keep a handle on the spider so its found_events can be read afterwards.
    crawler = next(iter(process.crawlers))
    spider = crawler.spider
    process.start()
    for event in spider.found_events:
        print(event)
04_events_with_selenium.py
from selenium import webdriver
def get_upcoming_events(url):
    """Print the upcoming events found at *url* using a Chrome browser.

    Opens *url* in a Selenium-driven Chrome instance and prints one dict per
    event with the keys 'name', 'location' and 'time'.

    Args:
        url: Address of the events listing page.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        events = driver.find_elements_by_xpath(
            '//ul[contains(@class, "list-recent-events")]/li')
        for event in events:
            event_details = {
                'name': event.find_element_by_xpath('h3[@class="event-title"]/a').text,
                'location': event.find_element_by_xpath('p/span[@class="event-location"]').text,
                'time': event.find_element_by_xpath('p/time').text,
            }
            print(event_details)
    finally:
        # Always end the WebDriver session, even when scraping raised, so no
        # browser/driver process is left behind.
        driver.quit()


get_upcoming_events('https://www.python.org/events/python-events/')
改用driver = webdriver.PhantomJS('phantomjs')可以使用无界面的方式,代码如下:
05_events_with_phantomjs.py
from selenium import webdriver
def get_upcoming_events(url):
    """Print the upcoming events found at *url* using headless PhantomJS.

    Opens *url* in a Selenium-driven PhantomJS instance and prints one dict
    per event with the keys 'name', 'location' and 'time'.

    Args:
        url: Address of the events listing page.
    """
    # 'phantomjs' is the executable name; it has to be reachable on PATH.
    # NOTE(review): webdriver.PhantomJS only exists in the Selenium 3 API this
    # tutorial is written against - confirm the installed Selenium version.
    driver = webdriver.PhantomJS('phantomjs')
    try:
        driver.get(url)
        events = driver.find_elements_by_xpath(
            '//ul[contains(@class, "list-recent-events")]/li')
        for event in events:
            event_details = {
                'name': event.find_element_by_xpath('h3[@class="event-title"]/a').text,
                'location': event.find_element_by_xpath('p/span[@class="event-location"]').text,
                'time': event.find_element_by_xpath('p/time').text,
            }
            print(event_details)
    finally:
        # Always end the WebDriver session, even when scraping raised, so no
        # browser/driver process is left behind.
        driver.quit()


get_upcoming_events('https://www.python.org/events/python-events/')
不过selenium配合Chrome的headless模式已经可以更好地代替phantomjs了。
04_events_with_selenium_headless.py
from selenium import webdriver
def get_upcoming_events(url):
    """Print the upcoming events found at *url* using headless Chrome.

    Opens *url* in a Selenium-driven Chrome instance started with the
    'headless' argument and prints one dict per event with the keys 'name',
    'location' and 'time'.

    Args:
        url: Address of the events listing page.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    driver = webdriver.Chrome(chrome_options=options)
    try:
        driver.get(url)
        events = driver.find_elements_by_xpath(
            '//ul[contains(@class, "list-recent-events")]/li')
        for event in events:
            event_details = {
                'name': event.find_element_by_xpath('h3[@class="event-title"]/a').text,
                'location': event.find_element_by_xpath('p/span[@class="event-location"]').text,
                'time': event.find_element_by_xpath('p/time').text,
            }
            print(event_details)
    finally:
        # Always end the WebDriver session, even when scraping raised, so no
        # browser/driver process is left behind.
        driver.quit()


get_upcoming_events('https://www.python.org/events/python-events/')
微信关注我们
转载内容版权归作者及来源网站所有!
低调大师中文资讯倾力打造互联网数据资讯、行业资源、电子商务、移动互联网、网络营销平台。持续更新报道IT业界、互联网、市场资讯、驱动更新,是最及时权威的产业资讯及硬件资讯报道平台。
近一个月的开发和优化,本站点的第一个app全新上线。该app采用极致压缩,本体才4.36MB。系统里面做了大量数据访问、缓存优化。方便用户在手机上查看文章。后续会推出HarmonyOS的适配版本。
Nacos /nɑ:kəʊs/ 是 Dynamic Naming and Configuration Service 的首字母简称,一个易于构建 AI Agent 应用的动态服务发现、配置管理和AI智能体管理平台。Nacos 致力于帮助您发现、配置和管理微服务及AI智能体应用。Nacos 提供了一组简单易用的特性集,帮助您快速实现动态服务发现、服务配置、服务元数据、流量管理。Nacos 帮助您更敏捷和容易地构建、交付和管理微服务平台。
Rocky Linux(中文名:洛基)是由Gregory Kurtzer于2020年12月发起的企业级Linux发行版,作为CentOS稳定版停止维护后与RHEL(Red Hat Enterprise Linux)完全兼容的开源替代方案,由社区拥有并管理,支持x86_64、aarch64等架构。其通过重新编译RHEL源代码提供长期稳定性,采用模块化包装和SELinux安全架构,默认包含GNOME桌面环境及XFS文件系统,支持十年生命周期更新。
Sublime Text具有漂亮的用户界面和强大的功能,例如代码缩略图,Python的插件,代码段等。还可自定义键绑定,菜单和工具栏。Sublime Text 的主要功能包括:拼写检查,书签,完整的 Python API , Goto 功能,即时项目切换,多选择,多窗口等等。Sublime Text 是一个跨平台的编辑器,同时支持Windows、Linux、Mac OS X等操作系统。