Having gone through the overview of Scrapy, we now have a general picture of this crawling framework. In this section we put it into practice by scraping Python book listings from Dangdang (dangdang.com).
- Create a project folder, open a terminal, and `cd` into it. In that directory, run:
```
scrapy startproject <project name>
```

Running this command creates the standard Scrapy project skeleton in the current folder; a concrete example for this project is shown below.
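The settings file later in this article names the project DDPython, so the actual command here would be the one below. The generated layout is the standard Scrapy template; the annotations mark the files edited in the following steps:

```
scrapy startproject DDPython
```

```
DDPython/
├── scrapy.cfg             # deployment configuration
└── DDPython/
    ├── __init__.py
    ├── items.py           # item definitions (edited below)
    ├── middlewares.py
    ├── pipelines.py       # item pipelines (edited below)
    ├── settings.py        # project settings (edited below)
    └── spiders/           # spider code lives here
        └── __init__.py
```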
- Change into the project folder and run:
```
scrapy genspider <spider name> <allowed domain>
```
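The spider in this project is named dd and is restricted to dangdang.com (see the spider code below), so the concrete command is:

```
scrapy genspider dd dangdang.com
```

This generates `spiders/dd.py` containing a skeleton `DdSpider` class, which is filled in in the spider step below.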
- Define the item fields in items.py.
```python
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

# Import scrapy
import scrapy


# Item class describing the book data to scrape (Dangdang Python books)
class DdpythonItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()          # book title
    author = scrapy.Field()        # author
    introduction = scrapy.Field()  # short description
    price = scrapy.Field()         # price
    press = scrapy.Field()         # publisher
    time = scrapy.Field()          # publication date
    comment_num = scrapy.Field()   # number of reviews
```
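A `scrapy.Item` behaves much like a dictionary, except that only the declared fields may be assigned. A quick standalone sketch (the book data here is purely illustrative, and the import path assumes the project module is named DDPython):

```python
from DDPython.items import DdpythonItem

item = DdpythonItem(name='Example Book', price='¥59.00')  # illustrative values
item['author'] = 'Example Author'   # assigning a declared field works
print(dict(item))                   # an Item converts cleanly to a plain dict
# item['publisher'] = '...'         # KeyError: 'publisher' is not a declared field
```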
- Open the spider file generated above and define the parsing logic.
```python
# -*- coding: utf-8 -*-
# Import scrapy and the DdpythonItem class defined in items.py
import scrapy
from ..items import DdpythonItem


# Spider class DdSpider for Dangdang; the spider name is 'dd'
class DdSpider(scrapy.Spider):
    name = 'dd'
    # Crawl Dangdang only, starting from the Python book search page
    allowed_domains = ['dangdang.com']
    start_urls = ['http://search.dangdang.com/?key=python']

    # Parse one search-result page
    def parse(self, response):
        # Each book in the result list is an <li> under <ul class="bigimg">
        books = response.xpath('//ul[@class="bigimg"]/li')
        # Extract the relevant fields for every book on the page
        for book in books:
            item = DdpythonItem()  # one Item object per book
            item['name'] = book.xpath('./a[@class="pic"]/@title').extract_first()
            item['introduction'] = book.xpath('./p[@class="detail"]/text()').extract_first(default='无介绍信息')
            item['author'] = book.xpath('./p/span[1]/a[1]/@title').extract_first(default='无作者信息')
            item['price'] = book.xpath('./p/span[@class="search_now_price"]/text()').extract_first()
            item['press'] = book.xpath('./p/span[3]/a/text()').extract_first(default='无出版社信息')
            item['time'] = book.xpath('./p[5]/span[2]/text()').extract_first(default='无出版时间信息')
            item['comment_num'] = book.xpath('./p[@class="search_star_line"]/a/text()').extract_first()
            # Hand the item over to the pipelines for storage
            yield item

        # Crawl 10 pages of Python books in total (page 1 comes from start_urls)
        pageNum = 10
        # Build the URL of each following page and feed it back to the engine;
        # duplicate requests are dropped by Scrapy's default dupefilter
        for page in range(2, pageNum + 1):
            url = 'http://search.dangdang.com/?key=python&page_index={}'.format(page)
            yield scrapy.Request(url, callback=self.parse)
```

Note that `extract_first()` is used so that every field is a single string (with a default value for fields that may be missing); the pipeline below relies on this.
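The XPath rules above are easiest to verify interactively before running the full spider. One way (assuming the search page is reachable from your machine) is Scrapy's built-in shell:

```python
# In a terminal:  scrapy shell 'http://search.dangdang.com/?key=python'
# Then, inside the interactive shell:
books = response.xpath('//ul[@class="bigimg"]/li')
len(books)                                                   # number of books on this page
books[0].xpath('./a[@class="pic"]/@title').extract_first()   # title of the first book
books[0].xpath('./p/span[@class="search_now_price"]/text()').extract_first()  # its price
```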
- In pipelines.py, write the data carried by each item into a MySQL database.
```python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# pymysql is used to talk to the MySQL database
import pymysql


# Pipeline that writes each scraped item into MySQL
class DdpythonPipeline(object):
    # Called once for every item yielded by the spider
    def process_item(self, item, spider):
        # Connect to the local MySQL server and select the database "dd"
        db = pymysql.connect(host="localhost", user="root", password="cpt200406",
                             db="dd", charset="utf8")
        cursor = db.cursor()  # cursor used to execute SQL statements

        # Clean up the raw values before storing them
        # (the spider uses extract_first(), so each field is already a string)
        name = item["name"]
        introduction = item["introduction"]
        author = item["author"]
        price = item["price"].strip('¥')                   # drop the leading currency sign
        press = item["press"]
        time = item["time"].strip().strip('/')             # drop the leading '/' of the date
        comment_num = item["comment_num"].strip('条评论')   # drop the trailing '条评论' ("reviews")

        # Insert the cleaned values into the table ddpython of database dd
        cursor.execute(
            'INSERT INTO ddpython(book_name,introduction,author,price,press,publication_time,comment_num) '
            'VALUES (%s,%s,%s,%s,%s,%s,%s)',
            (name, introduction, author, price, press, time, comment_num))
        db.commit()  # commit the transaction

        # Close the cursor and the connection
        cursor.close()
        db.close()
        return item
```
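The pipeline assumes that the database `dd` and the table `ddpython` already exist. A minimal setup sketch is given below; the column names follow the INSERT statement above, but the column types are assumptions, not taken from the original project:

```python
import pymysql

# Connect to the local MySQL server without selecting a database yet
db = pymysql.connect(host="localhost", user="root", password="cpt200406", charset="utf8")
cursor = db.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS dd DEFAULT CHARACTER SET utf8")
cursor.execute("USE dd")
# Column names match the pipeline's INSERT statement; the types are assumed
cursor.execute("""
    CREATE TABLE IF NOT EXISTS ddpython (
        id INT AUTO_INCREMENT PRIMARY KEY,
        book_name VARCHAR(255),
        introduction TEXT,
        author VARCHAR(255),
        price VARCHAR(32),
        press VARCHAR(255),
        publication_time VARCHAR(64),
        comment_num VARCHAR(32)
    ) DEFAULT CHARSET=utf8
""")
db.commit()
cursor.close()
db.close()
```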
- In settings.py, enable the item pipeline (the value 300 is the pipeline's order; lower numbers run earlier when several pipelines are enabled).
```python
# -*- coding: utf-8 -*-

# Scrapy settings for DDPython project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'DDPython'

SPIDER_MODULES = ['DDPython.spiders']
NEWSPIDER_MODULE = 'DDPython.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'DDPython (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'DDPython.middlewares.DdpythonSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'DDPython.middlewares.DdpythonDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'DDPython.pipelines.DdpythonPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
```
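A practical note that is not part of the original article: with the template defaults above, requests may be dropped if dangdang.com's robots.txt disallows the search pages, or rejected because of the default Scrapy User-Agent. If the crawl produces no items, the following settings.py tweaks are a common workaround (adjust them to your own situation and the site's terms of use):

```python
# Identify as a regular browser instead of the default Scrapy user agent
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36')

# Slow the crawl down a little to be polite to the site
DOWNLOAD_DELAY = 1

# Only if robots.txt blocks the search pages and you accept the implications:
# ROBOTSTXT_OBEY = False
```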
- From the project directory, start the spider with:
```
scrapy crawl <spider name>
```
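For this project the spider name is dd. To sanity-check the extracted fields without involving MySQL, the items can also be exported to a file with Scrapy's built-in feed export (the file name is just an example):

```
scrapy crawl dd
scrapy crawl dd -o books.json    # optional: dump the items to a JSON file as well
```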