Web crawling: avoiding the risk of running afoul of the law
Categories of crawlers:
The tug-of-war between crawlers and anti-crawling measures
Anti-crawling mechanisms
The HTTP protocol
Common request headers (see the sketch after this outline):
Common response headers (see the sketch after this outline)
The HTTPS protocol
Encryption methods
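The header names in the outline above come up constantly when writing crawlers; a minimal sketch of the values this kind of outline usually covers (the exact strings are illustrative, not taken from the original notes):

# Request headers a crawler typically sets (illustrative values)
request_headers = {
    # User-Agent: identifies the client; crawlers send a browser UA to pass basic anti-crawling checks
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0 Safari/537.36",
    # Connection: 'close' asks the server to drop the connection after responding,
    # which keeps long crawls from exhausting local sockets
    "Connection": "close",
}
# Response header most often inspected: Content-Type, which says whether the body is
# HTML, JSON, an image, etc., e.g. res.headers["Content-Type"] == "text/html; charset=utf-8"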
The requests module: a third-party Python library for sending network requests; it is powerful, simple and convenient, and very efficient.
Purpose: simulate a browser sending requests.
How to use it:
Environment setup: pip install requests
Hands-on coding:
import requests

heads = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44"
}

def get_url():
    # Build the Bing search URL from a user-supplied keyword
    kw = input('Enter a search keyword: ')
    return "https://cn.bing.com/search?q=" + kw

def get_data(url):
    # Send the request with a browser-like User-Agent and save the HTML response
    res = requests.get(url, headers=heads)
    save_file("search_results", res.text)

def save_file(name, data):
    with open(name + ".html", "w", encoding="utf-8") as wf:
        wf.write(data)

if __name__ == "__main__":
    url = get_url()
    get_data(url)
Classification of data-parsing approaches (covered below): regular expressions, bs4, xpath.
ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
Overview of data parsing:
The local text content to be parsed is always stored either between tags or inside a tag's attributes.
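As a quick illustration of the regex route, the ex pattern above can be applied with re.findall and the re.S flag; the HTML fragment below is made up purely for demonstration:

import re

ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'

# Made-up page fragment, just to show how the pattern is applied
page_text = '''
<div class="thumb">
    <img src="/static/img/001.jpg" alt="pic1">
</div>
<div class="thumb">
    <img src="/static/img/002.jpg" alt="pic2">
</div>
'''

# re.S lets '.' match newlines, so the pattern can span multiple lines of HTML
img_src_list = re.findall(ex, page_text, re.S)
print(img_src_list)  # ['/static/img/001.jpg', '/static/img/002.jpg']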
Environment setup: pip install bs4 lxml
How to instantiate a BeautifulSoup object:
from bs4 import BeautifulSoup
Instantiating the object: pass it either a local file handle or the fetched page source, plus a parser such as 'lxml' (see the sketch below).
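A minimal sketch of the two usual ways to build the object (the ./test.html path and the example.com URL are placeholders):

import requests
from bs4 import BeautifulSoup

# 1. Load a local HTML file into a BeautifulSoup object
with open('./test.html', 'r', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, 'lxml')

# 2. Load page source fetched from the web into a BeautifulSoup object
page_text = requests.get('https://example.com', headers={'User-Agent': 'Mozilla/5.0'}).text
soup = BeautifulSoup(page_text, 'lxml')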
Methods and attributes provided for data parsing:
soup.tagName: returns the first tag named tagName in the document, e.g. soup.title
soup.find(): find('tagName') is equivalent to soup.tagName; attribute-based lookup: soup.find('div', class_='song') (likewise with id=... or attrs={...})
soup.find_all('tagName'): returns all matching tags (as a list)
select:
select('some CSS selector (id, class, tag, ...)') returns a list.
Hierarchical selectors:
--soup.select('.tang > ul > li > a')  the > denotes a single level of the hierarchy
--soup.select('.tang > ul a')  a space denotes any number of levels
Getting the text between tags:
soup.a.text / soup.a.string / soup.a.get_text(): text and get_text() retrieve all of the text content inside a tag
string retrieves only the text sitting directly under that tag
Getting an attribute value from a tag:
soup.a['href']
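Putting the methods above together, a minimal usage sketch (the .tang/.song markup is made up here to mirror the class names used in these notes):

from bs4 import BeautifulSoup

html = '''
<div class="song"><a href="https://example.com/1">first song</a></div>
<div class="tang">
    <ul>
        <li><a href="https://example.com/2">tang poem</a></li>
    </ul>
</div>
'''
soup = BeautifulSoup(html, 'lxml')

print(soup.a)                                          # first <a> in the document
print(soup.find('div', class_='song'))                 # attribute-based lookup
print(soup.find_all('a'))                              # every <a>, as a list
print(soup.select('.tang > ul > li > a')[0]['href'])   # CSS hierarchy selector + attribute value
print(soup.select('.tang > ul a')[0].get_text())       # space = any number of levels; text content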
XPath parsing: the most commonly used, most convenient and efficient parsing method, and the most general-purpose one.
XPath parsing principle: instantiate an etree object, load the HTML source into it, then call its xpath() method with an XPath expression to locate tags.
Environment setup: pip install lxml
Usage (assuming html = etree.HTML(page_text) has already been built): result = html.xpath('//li')
print(result)
XPath expressions (the most common ones are sketched below):
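A minimal sketch of the most common expressions, run against a made-up fragment (the class names and URL are illustrative):

from lxml import etree

page_text = '''
<html>
  <body>
    <div class="song">
      <p>song line 1</p>
      <p>song line 2</p>
    </div>
    <div class="tang">
      <ul>
        <li><a href="https://example.com/poem">tang poem</a></li>
      </ul>
    </div>
  </body>
</html>
'''
tree = etree.HTML(page_text)

print(tree.xpath('/html/body/div'))                     # / starts from the root, one level per /
print(tree.xpath('//div'))                              # // matches at any depth
print(tree.xpath('//div[@class="song"]/p'))             # [@attr="value"] filters by attribute
print(tree.xpath('//div[@class="song"]/p[1]/text()'))   # indexing starts at 1; text() extracts text
print(tree.xpath('//div[@class="tang"]//a/@href'))      # @href extracts the attribute value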
The love-hate relationship between captchas and crawlers
Anti-crawling mechanism: captchas. The data in the captcha image has to be recognized in order to simulate a login.
Recognizing the captcha (here via a third-party recognition API):
import requests
import base64

def get_ocr_img(filePath):
    # Send the base64-encoded captcha image to a third-party recognition API
    url = 'https://www.jfbym.com/api/YmServer/verifyapi'
    base = encode_base64(filePath)
    data = {
        "image": base,
        "token": "lvfJhEHuS+2Kk5HWWFGz2pYZOOiaDvLeIauN35puIc=",
        "type": "10101"
    }
    headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }
    res = requests.post(url, data=data, headers=headers)
    print(res.text)

def encode_base64(file):
    # Read the image file and return its base64 encoding as a str
    with open(file, 'rb') as f:
        img_data = f.read()
    base64_data = base64.b64encode(img_data)
    # To view a base64 image in a browser, prefix it with: data:image/jpeg;base64,
    base64_str = str(base64_data, 'utf-8')
    return base64_str

def decode_base64(base64_data):
    # Decode a base64 string back into an image file
    with open('./images/base64.jpg', 'wb') as file:
        img = base64.b64decode(base64_data)
        file.write(img)

if __name__ == '__main__':
    get_ocr_img('./yzm.jpg')
A key property of HTTP/HTTPS: the protocols are stateless.
Why the expected page data may not come back: the server has no way of knowing that the request comes from a client that has already logged in.
cookie: used to let the server record the relevant state of the client:
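Because of that statelessness, the usual fix is a requests.Session, which stores the cookies a server sets and sends them back automatically; a minimal sketch (the login URL and form fields are placeholders):

import requests

headers = {'User-Agent': 'Mozilla/5.0'}

# The Session object keeps the cookies returned by the server
# and attaches them to every later request automatically.
session = requests.Session()

# Placeholder login endpoint and form data, just to show the flow
login_url = 'https://example.com/login'
session.post(login_url, data={'username': 'xxx', 'password': 'xxx'}, headers=headers)

# This request carries the login cookies, so pages behind the login can be fetched
res = session.get('https://example.com/profile', headers=headers)
print(res.status_code)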
What is a proxy? A proxy server.
What proxies are for: getting around restrictions placed on your own IP, and hiding your real IP.
Proxy-related sites: 快代理 (kuaidaili), 西祠代理 (Xici), www.goubanjia.com
Types of proxy IPs: http (for http URLs) and https (for https URLs)
Usage: res = requests.get(url, headers=headers, proxies={'https': '103.103.3.6:8080'}).text
Anonymity levels of proxy IPs: transparent, anonymous, high-anonymity
import requests

url = 'https://ip.hao86.com/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37'}
res = requests.get(url, headers=headers, proxies={'https': '10.103.3.6:8080'}).text
with open('./dd.html', 'w', encoding='utf-8') as wf:
    wf.write(res)
Goal: use asynchrony in a crawler to achieve high-performance data crawling.
Approaches to asynchronous crawling:
Without a thread pool (plain serial execution)
import time

def get_page(name):
    print('start-', name)
    time.sleep(2)
    print('end-', name)

name_list = ['xiaozi', 'aaa', 'bbb', 'ccc']
start_time = time.time()
for i in range(len(name_list)):
    get_page(name_list[i])
end_time = time.time()
print('%d seconds' % (end_time - start_time))  # 8 seconds
With a thread pool
import time
from multiprocessing.dummy import Pool

def get_page(name):
    print('start-', name)
    time.sleep(2)
    print('end-', name)

name_list = ['xiaozi', 'aaa', 'bbb', 'ccc']
start_time = time.time()
# for i in range(len(name_list)):
#     get_page(name_list[i])
# Instantiate the thread pool object
pool = Pool(4)  # a pool of 4 threads
# Hand every element of the list to get_page; map returns a list of get_page's return values
pool.map(get_page, name_list)
end_time = time.time()
print('%d seconds' % (end_time - start_time))  # 2 seconds
Single thread + asynchronous coroutines
import asyncio

async def request_url(url):
    print('Requesting url:', url)
    print('Request succeeded:', url)
    return url

# Calling an async function returns a coroutine object; nothing runs yet
c = request_url('www.baidu.com')

# Create an event loop object
# loop = asyncio.get_event_loop()
# # Register the coroutine object with the loop, then start the loop
# loop.run_until_complete(c)
# asyncio.run(c)  # shorthand that wraps the two steps above

# Using a task
# loop = asyncio.get_event_loop()
# task = loop.create_task(c, name='task1')
# print(task)
# loop.run_until_complete(task)

# # Using a future
# loop = asyncio.get_event_loop()
# task = asyncio.ensure_future(c)
# print(task)
# loop.run_until_complete(task)

# Binding a callback
def callback_func(task):
    print(task.result())

loop = asyncio.get_event_loop()
task = asyncio.ensure_future(c)
# Bind the callback to the task object; it is invoked once the task finishes
task.add_done_callback(callback_func)
loop.run_until_complete(task)
import asyncio
import time

async def request_url(url):
    print('Downloading', url)
    await asyncio.sleep(2)
    print('Download finished', url)

start = time.time()
urls = [
    'baidu',
    'google',
    'sougou',
    'bing'
]
# stasks = []
# for ul in urls:
#     c = request_url(ul)
#     task = asyncio.ensure_future(c)
#     stasks.append(task)
# Wrap each coroutine into a task (required on Python 3.11+, where asyncio.wait
# no longer accepts bare coroutine objects)
tasks = [asyncio.ensure_future(request_url(url)) for url in urls]
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))
ends = time.time()
print(ends - start)
Case study: asynchronous requests with aiohttp (the three URLs point at the Flask test server shown further below)
import asyncio
import time
import aiohttp

urls = [
    'http://127.0.0.1:5000/b1',
    'http://127.0.0.1:5000/b2',
    'http://127.0.0.1:5000/b3',
]

async def get_page(url):
    print('Downloading', url)
    # requests.get is synchronous, so it cannot be awaited; an asynchronous network
    # request module must be used instead (or the blocking call has to be pushed into
    # a thread and wrapped back into a coroutine).
    # aiohttp is such an asynchronous network request module.
    async with aiohttp.ClientSession() as session:
        # session also offers post(); get()/post() accept headers, params/data, proxy='http://ip:port'
        async with session.get(url) as res:
            pageText = await res.text()
            print('Download finished', pageText)

async def main():
    # Wrap the coroutines into tasks inside the running loop and wait for all of them
    tasks = [asyncio.create_task(get_page(url)) for url in urls]
    await asyncio.wait(tasks)

start = time.time()
asyncio.run(main())
ends = time.time()
print('Total time', ends - start)  # Total time 2.0147018432617188
Incorrect demonstration // requests has no coroutine support, so the three requests run one after another instead of concurrently
import asyncio
import time
import requests

urls = [
    'http://127.0.0.1:5000/b1',
    'http://127.0.0.1:5000/b2',
    'http://127.0.0.1:5000/b3',
]

async def get_page(url):
    print('Downloading', url)
    # requests.get blocks the single thread, so the coroutines cannot overlap
    res = requests.get(url)
    print('Download finished', res.text)

async def main():
    tasks = [asyncio.create_task(get_page(url)) for url in urls]
    await asyncio.wait(tasks)

start = time.time()
asyncio.run(main())
ends = time.time()
print('Total time', ends - start)
Backend test code, something along the lines below
from flask import Flask
import time

app = Flask(__name__)

@app.route('/b1')
def index_b1():
    time.sleep(2)
    return 'Hello b1'

@app.route('/b2')
def index_b2():
    time.sleep(2)
    return 'Hello b2'

@app.route('/b3')
def index_b3():
    time.sleep(2)
    return 'Hello b3'

if __name__ == '__main__':
    app.run(threaded=True)
Examples with urllib
Sending a GET request
import urllib.request

url = 'https://funletu.com/dong-tai/page/2'
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as response:
    data = response.read()
    ss = data.decode()
    print(ss)
Sending a POST request
import urllib.request
import urllib.parse

url = 'https://funletu.com/dong-tai/page/2'
params_dict = {'id': 123, 'query': 'all'}
params_str = urllib.parse.urlencode(params_dict)
params_bytes = params_str.encode()
# Passing data= turns the request into a POST
req = urllib.request.Request(url, data=params_bytes)
with urllib.request.urlopen(req) as response:
    data = response.read()
    ss = data.decode()
    print(ss)