
Downloading Files and Images from Autohome (汽车之家)

# Downloading files and images with the Scrapy framework

# Check whether folders and paths exist

# Spider file

import scrapy
from bmw.items import BmwItem


class Bme5Spider(scrapy.Spider):
    name = 'bme5'
    # The domain and start URL are truncated in the original post; the values
    # below assume the Autohome BMW 5 Series picture page (series id 65,
    # which the CrawlSpider rule later in this post confirms).
    allowed_domains = ['car.autohome.com.cn']
    start_urls = ['https://car.autohome.com.cn/pic/series/65.html']

    def parse(self, response):
        # xpath() returns a list of selectors; skip the first uibox, which is not a category
        uiboxs = response.xpath("//div[@class='content']/div[@class='row']//div[@class='uibox']")[1:]
        for uibox in uiboxs:
            category = uibox.xpath(".//div[@class='uibox-title']/a/text()").get()
            urls = uibox.xpath(".//ul/li/a/img/@src").getall()
            # the src attributes are protocol-relative, so prepend the scheme
            urls = list(map(lambda url: "https:" + url, urls))
            item = BmwItem(category=category, urls=urls)
            yield item
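The post never shows the first version of items.py; judging from the category and urls keyword arguments used above, it presumably looked like this:

# items.py (first version, reconstructed from the fields used above)
import scrapy

class BmwItem(scrapy.Item):
    category = scrapy.Field()
    urls = scrapy.Field()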

# Pipeline file: save the images
import os
from urllib import request


class BmwPipeline(object):
    def __init__(self):
        # os.path.dirname(__file__) is the directory holding this pipelines file;
        # applying dirname twice gives the outer bmw project directory.
        # Create an images folder under that directory and remember its path.
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
        if not os.path.exists(self.path):
            print("images folder does not exist")
            os.mkdir(self.path)  # create the images folder

    def process_item(self, item, spider):
        category = item['category']
        urls = item['urls']
        category_path = os.path.join(self.path, category)
        if not os.path.exists(category_path):
            os.mkdir(category_path)
        for url in urls:
            # use the last "_"-separated part of the URL as the file name
            image_name = url.split("_")[-1]
            # urlretrieve downloads synchronously, one image at a time
            request.urlretrieve(url, os.path.join(category_path, image_name))
        return item

The BmwPipeline above works, but request.urlretrieve downloads every image synchronously. Scrapy provides a reusable item pipeline for downloading the files attached to an item (for example, saving the corresponding images while crawling a product). These pipelines share common methods and structure, and are collectively called media pipelines. In general there are two: the Files Pipeline and the Images Pipeline.

Why use Scrapy's built-in file-download support?

1. It avoids re-downloading files or images that have already been downloaded (no duplicate image downloads).

2. It makes it easy to specify where files are stored.

3. It can convert downloaded images into a common format such as PNG or JPG.

4. It makes it easy to generate thumbnails.

5. It can check image width and height to enforce minimum size limits (points 4 and 5 map to settings; see the sketch after this list).

6. Downloads are asynchronous (important).
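A minimal settings sketch illustrating points 4 and 5; the values are arbitrary examples, not from the original post:

# settings.py (illustrative values)
IMAGES_THUMBS = {
    'small': (50, 50),
    'big': (270, 270),
}
IMAGES_MIN_HEIGHT = 110  # discard images shorter than 110 px
IMAGES_MIN_WIDTH = 110   # discard images narrower than 110 px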

# The Files Pipeline for downloading files

Steps:

1. Define an item with two fields, file_urls (which you populate) and files (which Scrapy populates). file_urls stores the URLs of the files to download and must be a list.

2. When a download finishes, the download information (such as the storage path, the source URL, and the file checksum) is stored in the item's files field.

3. In settings.py, configure FILES_STORE, which sets the directory the files are downloaded to.

4. Enable the pipeline: in settings.py, add scrapy.pipelines.files.FilesPipeline: 1 to ITEM_PIPELINES (a minimal sketch follows this list).
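A minimal sketch of these four steps; the item name DownloadItem and the store path are placeholders of mine, not from the original post:

# items.py
import scrapy

class DownloadItem(scrapy.Item):
    file_urls = scrapy.Field()  # you fill this with a list of URLs
    files = scrapy.Field()      # Scrapy fills this with the download results

# settings.py
FILES_STORE = 'downloads'
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}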

# The Images Pipeline for downloading images

Steps when using the Images Pipeline to download files:

1. Define an item with two fields, image_urls and images. image_urls stores the URLs of the images to download and must be a list.

2. When a download finishes, the download information (such as the storage path, the source URL, and the image checksum) is stored in the item's images field.

3. In settings.py, configure IMAGES_STORE, which sets the directory the images are downloaded to.

4. Enable the pipeline: add scrapy.pipelines.images.ImagesPipeline: 1 to ITEM_PIPELINES.
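Note: Scrapy's ImagesPipeline depends on the Pillow library for its image processing; if Pillow is not installed, the pipeline will not run.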

# BMW 5 Series image download

# Slightly modify the code above to download asynchronously

# First define the two fields in the items file

# items file
import scrapy


class BmwItem(scrapy.Item):
    # define the fields for your item here like:
    category = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()

# Spider file
import scrapy
from bmw.items import BmwItem


class Bme5Spider(scrapy.Spider):
    name = 'bme5'
    allowed_domains = ['car.autohome.com.cn']  # assumed domain, as above
    start_urls = ['https://car.autohome.com.cn/pic/series/65.html']

    def parse(self, response):
        uiboxs = response.xpath("//div[@class='content']/div[@class='row']//div[@class='uibox']")[1:]
        for uibox in uiboxs:
            category = uibox.xpath(".//div[@class='uibox-title']/a/text()").get()
            urls = uibox.xpath(".//ul/li/a/img/@src").getall()
            urls = list(map(lambda url: "https:" + url, urls))
            # the field is now image_urls, so the ImagesPipeline picks it up
            item = BmwItem(category=category, image_urls=urls)
            yield item

# settings file: the original custom pipeline is no longer executed
ITEM_PIPELINES = {
    # 'bmw.pipelines.BmwPipeline': 300,  # old pipeline, disabled
    'scrapy.pipelines.images.ImagesPipeline': 1,  # built-in images pipeline
}

# Directory the images are downloaded to, used by the ImagesPipeline
import os

IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')

# The pipeline file itself needs no changes for this step

# This runs much faster than the version above, but the images come down
# uncategorized: they all land in a single folder named "full". The next step
# is to sort the downloaded images into categories.

# Modify the pipelines file and subclass ImagesPipeline

-------------------------------------------------------------------------------------------------------------

import os

from scrapy.pipelines.images import ImagesPipeline

from bmw import settings


class BMWImagesPipeline(ImagesPipeline):
    # called before the download request is sent;
    # in fact this method is what issues the download requests
    def get_media_requests(self, item, info):
        request_objs = super(BMWImagesPipeline, self).get_media_requests(item, info)
        for request_obj in request_objs:
            # bind the item to the request so that file_path below
            # can recover the item from the request
            request_obj.item = item
        return request_objs

    # called when the image is stored, to decide its storage path
    def file_path(self, request, response=None, info=None):
        # the default path looks like "full/<hash>.jpg"
        path = super(BMWImagesPipeline, self).file_path(request, response, info)
        # get the item back and read its category
        category = request.item.get("category")
        # base directory where images are stored
        images_store = settings.IMAGES_STORE
        # make sure the category directory exists
        category_path = os.path.join(images_store, category)
        if not os.path.exists(category_path):
            os.mkdir(category_path)
        image_name = path.replace("full/", "")
        image_path = os.path.join(category_path, image_name)
        return image_path
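For the subclass to take effect, ITEM_PIPELINES presumably has to point at it instead of the built-in pipeline. The post does not show this step; a sketch, assuming the class lives in bmw/pipelines.py:

# settings.py
ITEM_PIPELINES = {
    'bmw.pipelines.BMWImagesPipeline': 1,
}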

# But so far we are only getting a subset of the thumbnails

# Now we want all of the high-resolution images

# Compare the thumbnail URL with the high-resolution URL

# Thumbnail: /cardfs/product/g28/M06/42/A7/t_autohomecar__

# High-res (same URL minus the t_ prefix): /cardfs/product/g28/M06/42/A7/autohomecar__
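The conversion is therefore a plain string replacement, which the spider below applies to every src. For example (the file name is hypothetical, since the post truncates it):

thumb = "/cardfs/product/g28/M06/42/A7/t_autohomecar__example.jpg"
full = thumb.replace("t_", "")
# -> "/cardfs/product/g28/M06/42/A7/autohomecar__example.jpg"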

# URLs of the 'More' pages (the part after # can be dropped)

/pic/series/#pvareaid=2042222

/pic/series/#pvareaid=2042222

/pic/series/#pvareaid=2042222

# Compare the pattern of the 'More' URLs; we will use CrawlSpider for this in a moment

/pic/series/

/pic/series/

/pic/series/

# Pick any 'More' page and look at the URLs of the pages inside it to find the pattern

/pic/series/

/pic/series/

/pic/series/

/pic/series/

# Fetch the high-resolution images

# The spider now inherits from CrawlSpider instead of scrapy.Spider

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from bmw.items import BmwItem


class Bme5Spider(CrawlSpider):
    name = 'bme5'
    allowed_domains = ['car.autohome.com.cn']  # assumed domain, as above
    start_urls = ['https://car.autohome.com.cn/pic/series/65.html']

    rules = (
        # follow every picture page of series 65 and parse it
        Rule(LinkExtractor(allow=r'/pic/series/65.+'),
             callback="parse_page", follow=True),
    )

    def parse_page(self, response):  # extract the high-res images
        category = response.xpath('//div[@class="uibox"]/div[1]/text()').get()
        print(category)
        srcs = response.xpath('//div[@class="uibox"]/div[2]/ul/li/a/img/@src').getall()
        # each src is a thumbnail link such as //.../upload/2012/8/22/t_...;
        # removing "t_" yields the high-res link
        srcs = list(map(lambda src: src.replace("t_", ""), srcs))
        # build the final list of high-res URLs
        srcs = list(map(lambda x: "https:" + x, srcs))
        item = BmwItem(category=category, image_urls=srcs)
        yield item
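To run the spider you would normally execute scrapy crawl bme5 from the project root. A small launcher script is also common; my sketch, not part of the original post:

# start.py, placed in the project root (hypothetical helper)
from scrapy import cmdline

cmdline.execute("scrapy crawl bme5".split())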
