pip install scrapy
To verify the installation, type the scrapy command in a terminal; if no error is reported, the installation succeeded.
Create a project: scrapy startproject xxxPro
The spiders subdirectory is the folder for spiders; create a spider file inside it with scrapy genspider.
Run the project: scrapy crawl spiderName
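For example, the full command sequence for the qiubai spider shown below might look like this (project, spider name and domain are only placeholders):
scrapy startproject xxxPro
cd xxxPro
scrapy genspider qiubai www.qiubai.com
scrapy crawl qiubai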
import scrapy

class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'  # name of the spider: the unique identifier of this spider source file
    # allowed_domains = ['www.qiubai.com']  # allowed domains, used to restrict which URLs in start_urls may be requested (usually left commented out)
    start_urls = ['http://www.baidu.com/', 'https://www.sogou.com/']  # list of start URLs; Scrapy automatically sends a request for each URL in this list

    # Used for data parsing; response is the response object of a successful request.
    # parse is called once per URL in start_urls.
    def parse(self, response):
        print(response, "-----")
Running the spider prints a lot of log output; if you only want to see your own output you can use:
scrapy crawl qiubai --nolog  (runs without printing any logs; not recommended)
Recommended instead: set LOG_LEVEL = 'ERROR' in the settings.py configuration file,
so that only error-level log messages are shown.
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.49'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
xpath() returns a list, and its elements are always Selector objects.
extract() pulls out the string stored in a Selector object's data attribute.
Calling extract() on the list extracts the data string from every Selector object in it.
To turn the resulting list into a single string: ss = ''.join(list)
extract_first() applies extract() to the first (index 0) element of the list; it is recommended when the Selector list contains exactly one element.
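A minimal sketch of the difference between extract() and extract_first() (the HTML fragment below is made up purely for illustration):
from scrapy.selector import Selector

# hypothetical HTML fragment, only to demonstrate the Selector API
sel = Selector(text='<div><a href="/a">first</a><a href="/b">second</a></div>')
links = sel.xpath('//a/text()')      # a SelectorList of Selector objects
print(links.extract())               # ['first', 'second'] -- the data string of every Selector
print(links.extract_first())         # 'first' -- extract() applied to the first element only
print(''.join(links.extract()))      # 'firstsecond' -- the list joined into one string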
import scrapy

class PicdSpider(scrapy.Spider):
    name = 'picd'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://funletu.com/dong-tai/page/2']

    def parse(self, response):
        all_data = []
        aList = response.xpath('//div[@class="s-a-c-l"]/h2/a')
        # The last three entries are added dynamically; to drop them, simply loop over fewer elements: range(len(aList) - 3)
        for a in aList:
            title = a.xpath('./text()').extract_first()
            path = a.xpath('./@href').extract_first()
            print(title, path)
            dic = {'title': title, 'path': path}
            all_data.append(dic)
        return all_data
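Since parse returns a list of dicts, the result can also be saved with Scrapy's built-in feed export directly from the terminal (the output file name is arbitrary), which is the simpler alternative to the pipeline approach described next:
scrapy crawl picd -o ./pic.csv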
Pipeline-based persistence (commonly used)
Workflow: 1. parse the data; 2. define the corresponding fields in items.py (see the sketch below); 3. pack the parsed data into an item-type object; 4. submit the item object to the pipeline for persistent storage; 5. in the pipeline class's process_item, persist the data carried by the received item object; 6. enable the pipeline in the configuration file.
Benefit: very general-purpose.
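Step 2 (defining the item fields) lives in items.py, which is not included in these notes; a minimal sketch, assuming the item class name matches the PicsoproPipeline below:
import scrapy

class PicsoproItem(scrapy.Item):
    # fields accessed by the pipeline as item['title'] and item['path']
    title = scrapy.Field()
    path = scrapy.Field()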
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class PicsoproPipeline(object):
    fp = None

    def open_spider(self, spider):
        print('开始-start')
        self.fp = open('./qiubai.txt', 'w', encoding='utf-8')

    # Dedicated to handling item objects:
    # this method receives the item objects submitted by the spider file,
    # and it is called once for every item it receives.
    def process_item(self, item, spider):
        titl = item['title']
        path = item['path']
        self.fp.write(titl + "---" + path + '\n')
        return item  # passes the item on to the next pipeline class to be executed

    def close_spider(self, spider):
        print('结束-close')
        self.fp.close()

# In the pipeline file, each pipeline class stores one set of data to one platform or medium.
class mysqlPileLine(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='192.168.101.11', port=3306, user='yanchenglong', password='a123456',
                                    charset='utf8', db='day17')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute('insert into pachong values(null,"%s","%s")' % (item['title'], item['path']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
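Step 6 (enabling the pipelines) goes into settings.py; a minimal sketch, assuming the project module is named picsoPro (the integers are priorities, lower runs first):
ITEM_PIPELINES = {
    'picsoPro.pipelines.PicsoproPipeline': 300,
    'picsoPro.pipelines.mysqlPileLine': 301,
}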
Explanation: crawl the page data for every page number under a given section.
Requirement: crawl the images and names from every page of an image website.
Implementation:
import scrapy

class ExamplexhsjSpider(scrapy.Spider):
    name = 'exampleXHSJ'
    # allowed_domains = ['www.cheaa.org']
    start_urls = ['https://www.cheaa.org/channels/116.html']
    # Generate a generic URL template
    url = 'https://www.cheaa.org/channels/116_%d.html'
    page_number = 2

    def parse(self, response):
        li_list = response.xpath('//table[@class="title_list"]/tr/td/a')
        for li in li_list:
            names = li.xpath('.//text()').extract_first()
            link = li.xpath('./@href').extract_first()
            print(names + "\t" + link)
        if self.page_number <= 3:
            new_url = format(self.url % self.page_number)
            self.page_number += 1
            # Send the request manually; callback is the function that parses the response
            yield scrapy.Request(url=new_url, callback=self.parse)
import scrapy
from bpsPro.items import BpsproItem
# from scrapyDemo.bpsPro.bpsPro.items import BpsproItem

class BpsSpider(scrapy.Spider):
    name = 'bps'
    # allowed_domains = ['sousuo.gov.cn']
    start_urls = ['http://sousuo.gov.cn/column/30214/1.htm']
    url = 'http://sousuo.gov.cn/column/30214/%d.htm'
    page_num = 2

    # Parse the names on the list page
    def parse(self, response):
        li_list = response.xpath('//ul[@class="listTxt"]/li/h4')
        for li in li_list:
            name = li.xpath('./a//text()').extract_first()
            # print(name)
            item = BpsproItem()
            item['name'] = name
            detail_url = li.xpath('./a/@href').extract_first()
            # Send a request for the detail page.
            # Request passing: the dict passed via meta is handed to the corresponding callback.
            yield scrapy.Request(detail_url, callback=self.parse_content, meta={'item': item})
        # Pagination
        if self.page_num <= 3:
            new_url = format(self.url % self.page_num)
            self.page_num += 1
            yield scrapy.Request(new_url, self.parse)

    def parse_content(self, response):
        content = response.xpath('//div[@id="UCAP-CONTENT"]/p[1]//text()').extract_first()
        item = response.meta['item']
        item['content'] = content
        yield item
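The spider above imports BpsproItem, but the project's items.py is not shown; a minimal sketch assuming only the two fields used above:
import scrapy

class BpsproItem(scrapy.Item):
    # fields filled in as item['name'] and item['content'] above
    name = scrapy.Field()
    content = scrapy.Field()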
If the image folder is not generated when using the ImagesPipeline below, try installing the Pillow library (pip install Pillow), which the pipeline depends on.
import scrapy
import re
from imgsPro.items import ImgsproItem

class ImgproSpider(scrapy.Spider):
    name = 'imgPro'
    # allowed_domains = ['n.monster']
    start_urls = ['https://vpton.monster/comic/2497']

    def parse(self, response):
        li_list = response.xpath('//ul[@class="mh-list col7"]/li')
        for li in li_list:
            name = li.xpath('./div/a/@title').extract_first()
            src = li.xpath('./div/a/p/@style').extract_first()
            # e.g. background-image: url(https://atic/upload/book/2488/cb5d29544077691b4c68b502caa096fe.jpg)
            # Use a regular expression to pull the image address out of src
            ex_content = r'url\((.*?)\)'  # escape the parentheses so they are matched literally
            content = re.findall(ex_content, src, re.M)
            item = ImgsproItem()
            item['src'] = content[0]
            yield item
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class ImgsproItem(scrapy.Item):
    # define the fields for your item here like:
    src = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import scrapy
from itemadapter import ItemAdapter
# class ImgsproPipeline:
#     def process_item(self, item, spider):
#         return item

from scrapy.pipelines.images import ImagesPipeline

class ImgPileLine(ImagesPipeline):
    # Request the image data from each image address
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['src'], meta={'item': item})

    # Decide the file name used when storing the downloaded image
    def file_path(self, request, response=None, info=None, *, item=None):
        imgName = request.url.split('/')[-1]
        return imgName

    def item_completed(self, results, item, info):
        return item  # pass the item on to the next pipeline class, same as process_item
# settings.py: register the custom pipeline class
ITEM_PIPELINES = {
    'imgsPro.pipelines.ImgPileLine': 1,
}
# Specify the directory where the downloaded images are stored
IMAGES_STORE = "images"
Downloader middleware
Position: between the engine and the downloader. Role: intercept, in bulk, every request and response issued in the whole project.
Intercepting requests: UA spoofing in process_request; proxy IP in process_exception, which must return the request.
Intercepting responses: tamper with the response data / response object.
Requirement: crawl the news data (titles and content) from the NetEase news site.
Configure the middleware in middlewares.py.
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import random
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class ZzjproSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
class ZzjproDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    User_Agent_List = [
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0",
        "Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00",
        "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8)",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; fr-FR) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    ]
    PROXY_http = [
        '182.61.201.201:80'
    ]
    PROXY_https = [
        '150.136.178.43:80',
        '216.21.18.194:80'
    ]

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # Intercept every request
    def process_request(self, request, spider):
        # Spoof the User-Agent
        request.headers['User-Agent'] = random.choice(self.User_Agent_List)
        # Set a proxy here only to verify that the proxy takes effect
        request.meta['proxy'] = 'http://' + '182.61.201.201:80'
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    # Intercept every response
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    # Intercept requests whose download raised an exception
    def process_exception(self, request, exception, spider):
        # Switch to a proxy appropriate to the URL scheme
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        else:
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        return request  # resend the corrected request object

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
settings.py
# Scrapy settings for zzjPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'zzjPro'
SPIDER_MODULES = ['zzjPro.spiders']
NEWSPIDER_MODULE = 'zzjPro.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'zzjPro.middlewares.ZzjproSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'zzjPro.middlewares.ZzjproDownloaderMiddleware': 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'zzjPro.pipelines.ZzjproPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Exercise
Crawl the data of NetEase news.
import scrapy
from selenium import webdriver
from wangyiPro.items import WangyiproItem

class WangproSpider(scrapy.Spider):
    name = 'wangPro'
    # allowed_domains = ['news.163.com']
    start_urls = ['https://news.163.com/domestic/']
    models_urls = []  # stores the URLs of the section pages

    # Instantiate a shared selenium browser when the spider is created
    def __init__(self):
        self.bro = webdriver.Edge(executable_path=r'C:\Users\sj176\Downloads\edgedriver_win64\msedgedriver.exe')

    # Parse the URLs of the five major sections
    def parse(self, response):
        li_list = response.xpath('//div[@class="ns_area list"]/ul/li')
        all_list = [1, 2, 4, 5]
        for li in all_list:
            mode_url = li_list[li].xpath('./a/@href').extract_first()
            self.models_urls.append(mode_url)
        # Request each section page in turn
        for url in self.models_urls:
            yield scrapy.Request(url, callback=self.parse_model)

    # The news titles and related content of every section are loaded dynamically
    def parse_model(self, response):  # parse the detail-page URLs of a section
        dis_lis = response.xpath('//div[@class="data_row news_article clearfix "]/a')
        for div in dis_lis:
            title = div.xpath('./img/@alt').extract_first()
            item = WangyiproItem()
            item['title'] = title
            new_detail_url = div.xpath('./@href').extract_first()
            print(new_detail_url)
            yield scrapy.Request(url=new_detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        content = response.xpath('//*[@id="content"]/div[2]//text()').extract()
        content = ''.join(content)
        item = response.meta['item']
        item['content'] = content
        yield item

    def closed(self, spider):
        self.bro.quit()
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class WangyiproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    content = scrapy.Field()
middlewares.py
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from scrapy.http import HtmlResponse
import time
class WangyiproSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
class WangyiproDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    # Intercept the response objects of the five sections and tamper with them
    def process_response(self, request, response, spider):
        # Every response passes through here, but only the responses of the five section pages are reworked
        if request.url in spider.models_urls:
            # Use selenium to conveniently obtain the dynamically loaded data
            spider.bro.get(request.url)  # request the section page with the browser
            time.sleep(2)
            pageText = spider.bro.page_source
            # with open('./dd.html', 'w', encoding='utf-8') as wf:
            #     wf.write(pageText)
            new_response = HtmlResponse(url=request.url, body=pageText, encoding='utf-8', request=request)
            return new_response
        else:
            return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class WangyiproPipeline:
    def process_item(self, item, spider):
        print(item)
        return item
settings.py
# Scrapy settings for wangyiPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'wangyiPro'
SPIDER_MODULES = ['wangyiPro.spiders']
NEWSPIDER_MODULE = 'wangyiPro.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL='ERROR'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'wangyiPro.middlewares.WangyiproSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'wangyiPro.pipelines.WangyiproPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://d.news.sun0769.com/hotline/review.asp?page=1']
    # Link extractor: extracts links according to the specified rule (allow="regex").
    # The '?' is a regex metacharacter, so escape it with a backslash.
    link = LinkExtractor(allow=r'review.asp\?page=\d+')
    rules = (
        # Rule parser: parses the links extracted by the link extractor with the specified callback
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response)
Requirement: crawl the news number, news title, and news content from the sun site.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ygPro.items import YgproItem, DetailItem

class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://d.news.sun0769.com/hotline/review.asp?page=1']
    # Link extractor: extracts links according to the specified rule (allow="regex").
    # The '?' is a regex metacharacter, so escape it with a backslash.
    link = LinkExtractor(allow=r'review.asp\?page=\d+')
    link_detail = LinkExtractor(allow=r'/hotline/ygwz_\d+\.htm')
    rules = (
        # Rule parser: parses the links extracted by the link extractor with the specified callback
        Rule(link, callback='parse_item', follow=True),
        # follow=True keeps applying the link extractor to the pages reached through the extracted links.
        # This produces duplicate URLs, but that is fine: the scheduler has a dupefilter that removes repeated requests.
        Rule(link_detail, callback='parse_detail', follow=False)
    )

    # The two parse methods below cannot pass request meta to each other,
    # so the data is stored in two separate item classes.
    def parse_item(self, response):
        # http://news.sun0769.com/hotline/ygwz_462.htm
        # http://news.sun0769.com/hotline/ygwz_461.htm
        divs = response.xpath('//div[@class="school_photo1"]|//div[@class="school_photo2"]')
        for div in divs:
            name = div.xpath('./div[@class="school_pci2"]/text()').extract_first()
            item = YgproItem()
            item['name'] = name
            yield item

    # Parse the news content and news number
    def parse_detail(self, response):
        new_id = response.xpath('//span[@class="txt16_b4"]/text()').extract_first()
        print(new_id)
        item = DetailItem()
        item['content'] = new_id
        yield item
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class YgproItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()

class DetailItem(scrapy.Item):
    content = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class YgproPipeline:
    def process_item(self, item, spider):
        # Decide which item type was received
        if item.__class__.__name__ == 'DetailItem':
            print(item['content'])
        else:
            print(item['name'])
        return item
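As a side note, the same type check could also be written with isinstance instead of comparing class names; a small equivalent sketch (not from the original notes):
from ygPro.items import DetailItem

class YgproPipeline:
    def process_item(self, item, spider):
        # isinstance-based variant of the class-name check above
        if isinstance(item, DetailItem):
            print(item['content'])
        else:
            print(item['name'])
        return item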
settings.py
# Scrapy settings for ygPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'ygPro'
SPIDER_MODULES = ['ygPro.spiders']
NEWSPIDER_MODULE = 'ygPro.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62'
LOG_LEVEL = "ERROR"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'ygPro.middlewares.YgproSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'ygPro.middlewares.YgproDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'ygPro.pipelines.YgproPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Concept: set up a distributed cluster and have the machines in the cluster jointly crawl one shared set of resources.
Purpose: increase crawling efficiency.
How to implement distribution?
Install the scrapy-redis component: pip install scrapy-redis
Native scrapy cannot run a distributed crawler on its own; scrapy must be combined with the scrapy-redis component.
Why can't native scrapy be distributed? Its scheduler and dupefilter cannot be shared across machines.
Role of the scrapy-redis component: it provides a shared, Redis-backed scheduler, request queue, dupefilter, and pipeline.
Implementation flow: change the spider's parent class to RedisCrawlSpider, replace start_urls with a redis_key, add the scrapy_redis pipeline/scheduler settings below, and push the start URL into the Redis list.
# Specify the scrapy_redis pipeline and scheduler
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400
}
# Dupefilter
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# Use scrapy_redis's own scheduler
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
# Whether the scheduler persists its state: when the crawl ends, should the request queue and the
# dedup fingerprint set in Redis be kept (True) instead of cleared?
SCHEDULER_PERSIST = True
Specify the Redis server:
REDIS_HOST = 'ip address of the redis server'
REDIS_PORT = 6379
In the Redis configuration file, comment out the bind line so that other machines can connect:
# IF YOU ARE SURE YOU WANT YOUR INSTANCE TO LISTEN TO ALL THE INTERFACES
# JUST COMMENT THE FOLLOWING LINE.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# bind 127.0.0.1
If an Iterable-related error occurs during execution, the fix is to change the offending import at the top to:
from collections.abc import Iterable
# This method must be overridden, otherwise the built-in one is used, which breaks on newer Python versions:
def make_requests_from_url(self, url):
    yield scrapy.Request(url=url)
Project address:
fbs.py
import scrapy
from collections.abc import Iterable
from scrapy.linkextractors import LinkExtractor
import sys, os
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
from scrapy.spiders import CrawlSpider, Rule
from fbsPro.items import FbsproItem
from scrapy_redis.spiders import RedisCrawlSpider  # step 1: import RedisCrawlSpider

class FbsSpider(RedisCrawlSpider):
    name = 'fbs'
    # allowed_domains = ['www.xxx.com']
    # start_urls = ['http://www.xxx.com/']
    # Step 2: comment out the start_urls list above and override it with a redis_key attribute,
    # the name of the Redis list that holds the start URL
    redis_key = 'sun'

    rules = (
        Rule(LinkExtractor(allow=r'review.asp\?page=\d+'), callback='parse_item', follow=True),
    )

    # In the Redis client: lpush sun http://d.news.sun0769.com/hotline/review.asp?page=1
    # This method must be overridden, otherwise the default one is used, which breaks on newer Python versions
    def make_requests_from_url(self, url):
        yield scrapy.Request(url=url)

    def parse_item(self, response):
        # http://news.sun0769.com/hotline/ygwz_462.htm
        # http://news.sun0769.com/hotline/ygwz_461.htm
        divs = response.xpath('//div[@class="school_photo1"]|//div[@class="school_photo2"]')
        for div in divs:
            name = div.xpath('./div[@class="school_pci3_2"]//text()').extract_first()
            new_num = div.xpath('./div[@class="school_pci3_2"]/a/@href').extract_first()
            item = FbsproItem()
            item['name'] = name
            item['new_num'] = new_num
            yield item
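A typical way to start the distributed run (the commands are standard; the Redis key and URL follow the example above) is to start the spider on every machine and then push the start URL into the Redis list named by redis_key:
scrapy runspider fbs.py     (from the spiders directory; scrapy crawl fbs also works inside the project)
lpush sun http://d.news.sun0769.com/hotline/review.asp?page=1     (run in redis-cli)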
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class FbsproItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    new_num = scrapy.Field()
settings.py
# Scrapy settings for fbsPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'fbsPro'
SPIDER_MODULES = ['fbsPro.spiders']
NEWSPIDER_MODULE = 'fbsPro.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36 Edg/103.0.1264.62'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'fbsPro.middlewares.FbsproSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'fbsPro.middlewares.FbsproDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'fbsPro.pipelines.FbsproPipeline': 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Specify the scrapy_redis pipeline and scheduler
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400
}
# Dupefilter
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# Use scrapy_redis's own scheduler
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
# Whether the scheduler persists its state: when the crawl ends, should the request queue and the
# dedup fingerprint set in Redis be kept (True) instead of cleared?
SCHEDULER_PERSIST = True
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
Concept: detect updates to a website's data and crawl only the newly added data.
Analysis: record the detail-page URLs that have already been crawled in a Redis set; before requesting a detail page, try to add its URL to the set and only crawl it if the URL was not there yet.
Demo code
pbs.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from pbPro.items import PbproItem

class PbsSpider(CrawlSpider):
    name = 'pbs'
    # allowed_domains = ['www.pianba.tv']
    start_urls = ['https://www.pianba.tv/class/6--------1---.html']
    link = LinkExtractor(allow=r'/class/6--------(\d+)---.html')
    rules = (
        Rule(link, callback='parse_item', follow=False),
    )
    conn = Redis(host='127.0.0.1', port=6379)

    # Parse the detail-page URLs
    def parse_item(self, response):
        lis_li = response.xpath('//ul[@class="stui-vodlist clearfix"]/li')
        for li in lis_li:
            href = 'https://www.pianba.tv' + li.xpath('./div/a/@href').extract_first()
            title = li.xpath('./div/a/@title').extract_first()
            item = PbproItem()
            item['name'] = title
            # Add the detail-page URL to a Redis set for de-duplication
            ex = self.conn.sadd('urls', href)
            if ex == 1:
                print('This URL has not been crawled yet, its data can be fetched')
                yield scrapy.Request(url=href, callback=self.parse_cont, meta={'item': item})
            else:
                print('No new data to crawl')

    def parse_cont(self, response):
        con_li = response.xpath('//div[@class="stui-content__detail"]//text()').extract()
        item = response.meta['item']
        item['content'] = ''.join(con_li)
        yield item
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class PbproItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    content = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class PbproPipeline:
    conn = None

    def open_spider(self, spider):
        # Reuse the Redis connection created in the spider
        self.conn = spider.conn

    def process_item(self, item, spider):
        dic = {
            'name': item['name'],
            'content': item['content']
        }
        self.conn.lpush('movieData', str(dic))
        return item
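To check what was stored, the list can be read back from redis-cli (a standard Redis command; the key name matches the lpush call above):
lrange movieData 0 -1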