
Python Web Scraping (requests, proxies, web authentication, SSL certificate verification)

requests module
    Proxies (proxies)
        西刺代理 (Xici)
        快代理 (Kuaidaili)
        全网代理 (Quanwang)
        Elite (high-anonymity) IP: the target server cannot see your real IP
        Transparent IP: the target server can see both the proxy IP and your real IP
    Regular proxy
        proxies = {"protocol": "protocol://IP:port"}

'''01_普通代理示例.py'''
import requests

url = "http://www.baidu.com/"
proxies = {"http": "http://183.129.207.82:11597"}
headers = {"User-Agent": "Mozilla/5.0"}

res = requests.get(url, proxies=proxies, headers=headers)
print(res.status_code)
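To confirm that traffic really goes through the proxy, a quick check is to request an endpoint that echoes the caller's IP. A minimal sketch, assuming the same free proxy as above is still reachable (free proxies go offline often):

import requests

proxies = {"http": "http://183.129.207.82:11597"}   # same free proxy as above; may be offline
headers = {"User-Agent": "Mozilla/5.0"}

# httpbin echoes the origin IP; with a working proxy it should show the proxy's IP, not yours
res = requests.get("http://httpbin.org/ip", proxies=proxies, headers=headers, timeout=5)
print(res.json())   # e.g. {"origin": "183.129.207.82"}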

    Private (authenticated) proxy
        proxies = {"protocol": "protocol://username:password@IP:port"}

'''02_私密代理示例.py'''
import requests

url = "http://httpbin.org/get"
headers = {"User-Agent": "Mozilla/5.0"}
proxies = {"http": "http://309435365:szayclhp@123.206.119.108:16817"}

res = requests.get(url, proxies=proxies, headers=headers)
res.encoding = "utf-8"
print(res.text)

Scrape Lianjia second-hand housing listings and store them in a MySQL database

'''05_链家数据ToMongo.py'''
import requests
import re
import pymysql
import warnings

class LianjiaSpider:
    def __init__(self):
        self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
        self.page = 1
        self.headers = {"User-Agent": "Mozilla/5.0"}
        # local debugging proxy; requests will fail if nothing is listening here
        self.proxies = {"http": "http://127.0.0.1:8888"}
        self.db = pymysql.connect(host="localhost", user="root",
                                  password="ParisPython", charset="utf8")
        self.cursor = self.db.cursor()

    # fetch one listing page
    def getPage(self, url):
        res = requests.get(url, proxies=self.proxies,
                           headers=self.headers, timeout=5)
        res.encoding = "utf-8"
        html = res.text
        print("Page fetched, parsing...")
        self.parsePage(html)

    # extract (name, total price, unit) tuples from the page
    def parsePage(self, html):
        p = re.compile('<div class="houseInfo".*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>(.*?)</div>', re.S)
        r_list = p.findall(html)
        # [("天通苑","480","万"), ...]
        print("Parsing done, saving to database...")
        self.writeTomysql(r_list)

    def writeTomysql(self, r_list):
        c_db = "create database if not exists Lianjiadb character set utf8"
        u_db = "use Lianjiadb"
        c_tab = "create table if not exists housePrice( \
                 id int primary key auto_increment, \
                 housename varchar(50), \
                 totalprice int)charset=utf8"
        warnings.filterwarnings("ignore")
        try:
            self.cursor.execute(c_db)
            self.cursor.execute(u_db)
            self.cursor.execute(c_tab)
        except Warning:
            pass
        ins = "insert into housePrice(housename,totalprice) values(%s,%s)"
        for r_tuple in r_list:
            name = r_tuple[0].strip()
            price = float(r_tuple[1].strip()) * 10000   # price is given in 万 (10,000 CNY)
            L = [name, price]
            self.cursor.execute(ins, L)
            self.db.commit()
        print("Saved to database")

    def workOn(self):
        while True:
            c = input("Press y to crawl (q to quit): ")
            if c.strip().lower() == "y":
                url = self.baseurl + str(self.page) + "/"
                self.getPage(url)
                self.page += 1
            else:
                self.cursor.close()
                self.db.close()
                print("Done, thanks for using!")
                break

if __name__ == "__main__":
    spider = LianjiaSpider()
    spider.workOn()

    Find the URL
      https://bj.lianjia.com/ershoufang/pg1/
    Regular expression
<div class="houseInfo".*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>(.*?)</div>
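To see what the three capture groups return, here is a minimal sketch run against a hand-written HTML fragment (not real Lianjia markup, just the same shape):

import re

html = '''<div class="houseInfo"><a data-el="region">天通苑</a></div>
<div class="totalPrice"><span>480</span>万</div>'''

p = re.compile(
    '<div class="houseInfo".*?data-el="region">(.*?)</a>'
    '.*?<div class="totalPrice">.*?<span>(.*?)</span>(.*?)</div>',
    re.S)
print(p.findall(html))   # [('天通苑', '480', '万')]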
  Web client authentication (parameter: auth)
    auth = ("username", "password")
    Example: 09_Web客户端验证.py

'''09_Web客户端验证.py'''
import requests
import re

class NoteSpider:
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.url = "网址"   # placeholder: URL of the protected page
        self.proxies = {"http": "http://309435365:szayclhp@123.206.119.108:16817"}
        # the auth parameter holds the username and password (must be a tuple)
        self.auth = ("账号", "密码")   # placeholder: account, password

    def getParsePage(self):
        res = requests.get(self.url,
                           proxies=self.proxies,
                           headers=self.headers,
                           auth=self.auth,
                           timeout=3)
        res.encoding = "utf-8"
        html = res.text
        print(html)
        p = re.compile('<a href=".*?>(.*?)</a>', re.S)
        r_list = p.findall(html)
        print(r_list)
        self.writePage(r_list)

    def writePage(self, r_list):
        print("Writing to file...")
        with open("达内科技.txt", "a") as f:
            for r_str in r_list:
                f.write(r_str + "\n\n")
        print("Done")

if __name__ == "__main__":
    spider = NoteSpider()
    spider.getParsePage()

  SSL certificate verification (parameter: verify)
    verify = True : default; verify the server's SSL certificate
    verify = False: skip certificate verification

'''10_SSL证书认证示例.py'''
import requests

url = "https://www.12306.cn/mormhweb/"
headers = {"User-Agent": "Mozilla/5.0"}

res = requests.get(url, headers=headers, verify=False)
res.encoding = "utf-8"
print(res.text)
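With verify=False, urllib3 (used internally by requests) emits an InsecureRequestWarning on every request. A minimal sketch for silencing that warning, assuming a standard urllib3 install:

import urllib3
import requests

# verify=False skips certificate checks, so urllib3 warns on every request;
# disable the warning explicitly if the noise is not wanted.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

res = requests.get("https://www.12306.cn/mormhweb/",
                   headers={"User-Agent": "Mozilla/5.0"},
                   verify=False)
print(res.status_code)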

Handler objects in urllib.request
  Definition
    urlopen() is a ready-made, special-purpose opener defined by the module; it does not
    support features such as proxies. To get those features, build a custom opener
    object from a Handler object.
  Common methods
    build_opener(Handler object) : create an opener object
    opener.open(url, ...)

import urllib.request

url = "http://www.baidu.com/"

# create a Handler object
http_handler = urllib.request.HTTPHandler()
# proxy_handler = urllib.request.ProxyHandler()

# build a custom opener from the handler
opener = urllib.request.build_opener(http_handler)

# send the request with the opener's open() method
req = urllib.request.Request(url)
res = opener.open(req)
print(res.read().decode("utf-8"))

  Handler classes
    HTTPHandler() : no special features
    ProxyHandler(regular proxy)
      proxy: {"protocol": "IP:port"}
    ProxyBasicAuthHandler(password manager object) : private (authenticated) proxy
    HTTPBasicAuthHandler(password manager object) : web client authentication
    Purpose of the password manager object
      private proxy
      web client authentication
    Workflow (full sketch after the code block below)
      Create the password manager object
        pwdmg = urllib.request.HTTPPasswordMgrWithDefaultRealm()
      Add the credentials to the password manager
        pwdmg.add_password(None, webserver, user, passwd)
      Create the Handler object
        private proxy
          proxy = urllib.request.ProxyBasicAuthHandler(pwdmg)
        web client
          webbasic = urllib.request.HTTPBasicAuthHandler(pwdmg)

import urllib.request

url = "http://www.baidu.com/"
proxy = {"http": "183.129.207.82:11597"}   # regular proxy: {"protocol": "IP:port"}

# create the Handler object
pro_hand = urllib.request.ProxyHandler(proxy)
# build a custom opener from the handler
opener = urllib.request.build_opener(pro_hand)
# send the request with the opener's open() method
req = urllib.request.Request(url)
res = opener.open(req)
print(res.read().decode("utf-8"))
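The block above only wires up a regular ProxyHandler. For the password-manager flow described in the outline, here is a minimal sketch (private proxy shown; HTTPBasicAuthHandler for web client authentication follows the same pattern). The proxy address and credentials are simply reused from the earlier requests example and may no longer be valid:

import urllib.request

url = "http://httpbin.org/get"
proxyserver = "123.206.119.108:16817"     # private proxy host:port from the earlier example
user, passwd = "309435365", "szayclhp"

# 1. create the password manager and register the credentials
pwdmg = urllib.request.HTTPPasswordMgrWithDefaultRealm()
pwdmg.add_password(None, proxyserver, user, passwd)

# 2. create the handlers: one to route through the proxy, one to answer its auth challenge
proxy_handler = urllib.request.ProxyHandler({"http": proxyserver})
proxy_auth_handler = urllib.request.ProxyBasicAuthHandler(pwdmg)

# 3. build the opener and send the request
opener = urllib.request.build_opener(proxy_handler, proxy_auth_handler)
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
res = opener.open(req)
print(res.read().decode("utf-8"))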

Scrape the Maoyan Top 100 movie chart and store it in MongoDB

'''06_猫眼电影top100抓取.py'''
import requests
import re
import pymongo

class MaoyanSpider:
    def __init__(self):
        self.baseurl = "http://maoyan.com/board/4?offset="
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.page = 1
        self.offset = 0
        self.proxies = {"http": "http://309435365:szayclhp@123.206.119.108:16817"}
        self.conn = pymongo.MongoClient("localhost", 27017)
        self.db = self.conn.Film
        self.myset = self.db.top100

    # download one page
    def loadPage(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = "utf-8"
        html = res.text
        self.parsePage(html)

    # parse one page
    def parsePage(self, html):
        p = re.compile('<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?releasetime">(.*?)</p>', re.S)
        r_list = p.findall(html)
        # [("霸王别姬","张国荣","1994-01-01"), (), ()...]
        self.writeTomongo(r_list)

    # store the records in MongoDB
    def writeTomongo(self, r_list):
        for r_tuple in r_list:
            name = r_tuple[0].strip()
            star = r_tuple[1].strip()
            releasetime = r_tuple[2].strip()
            D = {"name": name,
                 "star": star,
                 "releasetime": releasetime}
            self.myset.insert_one(D)
        print("Saved to database")

    def workOn(self):
        while True:
            c = input("Press y to crawl (anything else to quit): ")
            if c.strip().lower() == "y":
                self.offset = (self.page - 1) * 10
                url = self.baseurl + str(self.offset)
                self.loadPage(url)
                self.page += 1
            else:
                print("Done, thanks for using!")
                break

if __name__ == "__main__":
    spider = MaoyanSpider()
    spider.workOn()


Original article: https://yq.aliyun.com/articles/665023