Scrapy in Practice (Quickly Scraping Beauty Wallpapers)
Preface
Let's buckle down and build something practical today; besides, I still haven't worked through this week's algorithm problems.
Target Site
https://pic.netbian.com
Analysis
OK, with the target site settled, the next question is how to crawl it. Our goal is to crawl N pages of images.
Pagination
First, clicking "next page", we notice the URL becomes
https://pic.netbian.com/index_2.html
Only the index part of the URL changes, and it follows a regular pattern, so we can extract a template directly:
https://pic.netbian.com/index_{}.html
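As a minimal sketch of that template (page 1 is the bare index.html, so only pages 2 and up use the suffix, matching the spider below):

base_url = "https://pic.netbian.com/index_{}.html"
page_urls = ["https://pic.netbian.com/index.html"]        # page 1 has no suffix
page_urls += [base_url.format(i) for i in range(2, 11)]   # pages 2..10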
Getting the images on a page
Inspecting the page, all of the current page's thumbnails sit under the same list path, and clicking a thumbnail
leads to a detail page one level deeper.
That detail page is where the full-size image lives.
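Before writing the spider, it helps to confirm the XPath expressions interactively in scrapy shell. The selectors below are the same ones used in the spider; treat them as assumptions that may need adjusting if the page layout changes:

# Verify the selectors first:  scrapy shell "https://pic.netbian.com/index.html"
lis = response.xpath('//*[@id="main"]/div[3]/ul/li')   # one <li> per thumbnail
lis[0].xpath('./a/@href').extract_first()              # relative link to the detail page
lis[0].xpath('./a/img/@alt').extract_first()           # the image title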
Coding
Now that we roughly understand the flow, we can get straight to coding.
Project structure
The layout is very simple: first is our spider. Now let's get to work.
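For completeness, this skeleton is what the standard Scrapy commands generate (project name dome, spider name first, matching the code below):

scrapy startproject dome
cd dome
scrapy genspider first pic.netbian.com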
Writing the spider
import scrapy
from dome.items import DomeItem


class FirstSpider(scrapy.Spider):
    name = 'first'
    baseUrl = "https://pic.netbian.com/index_{}.html"
    path_index = 2
    max_page = 10  # crawl at most 10 pages
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://pic.netbian.com/index.html']  # home page
    domains = "https://pic.netbian.com"

    def parse(self, response):
        li_images = response.xpath('//*[@id="main"]/div[3]/ul/li')
        for li in li_images:
            link_image = self.domains + li.xpath("./a/@href").extract_first()
            image_name = li.xpath("./a/@title | ./a/img/@alt").extract_first()
            print(image_name)
            # follow the link into the detail page
            item = DomeItem()
            item['image_name'] = image_name
            headers = {'referer': link_image}
            yield scrapy.Request(link_image, callback=self.parse_image,
                                 meta={'item': item}, headers=headers)
        if self.path_index <= self.max_page:
            url = self.baseUrl.format(self.path_index)
            self.path_index += 1
            print(url, "queued for crawling")
            yield scrapy.Request(url, callback=self.parse)

    def parse_image(self, response):
        item = response.meta['item']
        image_src = self.domains + response.xpath('//*[@id="img"]/img/@src').extract_first()
        item['image_src'] = image_src
        yield item
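Once the item, pipeline and settings described below are in place, the crawl is started from the project root:

scrapy crawl first
# the images end up under IMAGES_STORE, each named <image_name>.jpg by the pipeline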
Writing the item
There are only two fields here: the image name and its URL.
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class DomeItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_name = scrapy.Field()
    image_src = scrapy.Field()
Saving the images and setting the UA
The items are then handed to the pipeline, which downloads the images.
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
'''
open_spider
close_spider
return item  -> passes the item on to the next pipeline to be executed
'''
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class DomePipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # request the image data
        yield scrapy.Request(item['image_src'])

    def file_path(self, request, response=None, info=None, *, item=None):
        return item['image_name'] + '.jpg'

    def item_completed(self, results, item, info):
        # pass the item on, in case you have another pipeline after this one
        return item
Note that the image save path has to be set in the settings file.
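Concretely, that means two entries in settings.py: register the pipeline and point IMAGES_STORE at a directory (the ImagesPipeline also requires the Pillow package to be installed). These values match the full settings file below:

ITEM_PIPELINES = {
   'dome.pipelines.DomePipeline': 300,   # register our ImagesPipeline subclass
}
IMAGES_STORE = './'                      # root directory for the downloaded images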
There are also the request headers; I handle those directly in a downloader middleware, although you could also pass them as parameters on each request.
Here is my settings file in full:
# Scrapy settings for dome project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'dome'
SPIDER_MODULES = ['dome.spiders']
NEWSPIDER_MODULE = 'dome.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
LOG_LEVEL = 'ERROR'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
IMAGES_STORE='./'
USER_AGENT_LIST=[
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = True
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'dome.middlewares.DomeSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'dome.middlewares.DomeDownloaderMiddleware': 543,
# }
DOWNLOADER_MIDDLEWARES = {
   'dome.middlewares.DomeDownloaderMiddleware': 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'dome.pipelines.DomePipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Then open your middleware file (dome/middlewares.py) and set the User-Agent there.
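The middleware itself was shown as a screenshot in the original post; here is a minimal sketch of what dome/middlewares.py can look like, assuming all it needs to do is rotate the User-Agent from the USER_AGENT_LIST defined in settings.py (the class name matches the one registered in DOWNLOADER_MIDDLEWARES above):

import random


class DomeDownloaderMiddleware:

    def process_request(self, request, spider):
        # pick a random User-Agent from USER_AGENT_LIST in settings.py
        ua_list = spider.settings.getlist('USER_AGENT_LIST')
        if ua_list:
            request.headers['User-Agent'] = random.choice(ua_list)
        return None  # let the request continue through the download chain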
Reposted from: https://juejin.cn/post/7073440217919111199