Having gone through the overview of Scrapy, we now have a general picture of this crawling framework. In this section we put it into practice by scraping Python book listings from Dangdang (dangdang.com).
- Create a project folder, open a terminal, and `cd` into it. In that directory, run:
```
scrapy startproject <project name>
```

Running this command creates the standard Scrapy project skeleton in the current folder; a concrete example for this project is shown below.
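The settings file later in this article names the project DDPython, so the actual command here would be the one below. The generated layout is the standard Scrapy template; the annotations mark the files edited in the following steps:

```
scrapy startproject DDPython
```

```
DDPython/
├── scrapy.cfg             # deployment configuration
└── DDPython/
    ├── __init__.py
    ├── items.py           # item definitions (edited below)
    ├── middlewares.py
    ├── pipelines.py       # item pipelines (edited below)
    ├── settings.py        # project settings (edited below)
    └── spiders/           # spider code lives here
        └── __init__.py
```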
- Change into the project folder and run:
```
scrapy genspider <spider name> <allowed domain>
```
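The spider in this project is named dd and is restricted to dangdang.com (see the spider code below), so the concrete command is:

```
scrapy genspider dd dangdang.com
```

This generates `spiders/dd.py` containing a skeleton `DdSpider` class, which is filled in in the spider step below.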
- Define the item fields in items.py.
```python
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

# Import scrapy
import scrapy


# Item class describing the book data to scrape (Dangdang Python books)
class DdpythonItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()          # book title
    author = scrapy.Field()        # author
    introduction = scrapy.Field()  # short description
    price = scrapy.Field()         # price
    press = scrapy.Field()         # publisher
    time = scrapy.Field()          # publication date
    comment_num = scrapy.Field()   # number of reviews
```
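A `scrapy.Item` behaves much like a dictionary, except that only the declared fields may be assigned. A quick standalone sketch (the book data here is purely illustrative, and the import path assumes the project module is named DDPython):

```python
from DDPython.items import DdpythonItem

item = DdpythonItem(name='Example Book', price='¥59.00')  # illustrative values
item['author'] = 'Example Author'   # assigning a declared field works
print(dict(item))                   # an Item converts cleanly to a plain dict
# item['publisher'] = '...'         # KeyError: 'publisher' is not a declared field
```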
- Open the spider file generated above and define the parsing logic.
```python
# -*- coding: utf-8 -*-
# Import scrapy and the DdpythonItem class defined in items.py
import scrapy
from ..items import DdpythonItem


# Spider class DdSpider for Dangdang; the spider name is 'dd'
class DdSpider(scrapy.Spider):
    name = 'dd'
    # Crawl Dangdang only, starting from the Python book search page
    allowed_domains = ['dangdang.com']
    start_urls = ['http://search.dangdang.com/?key=python']

    # Parse one search-result page
    def parse(self, response):
        # Each book in the result list is an <li> under <ul class="bigimg">
        books = response.xpath('//ul[@class="bigimg"]/li')
        # Extract the relevant fields for every book on the page
        for book in books:
            item = DdpythonItem()  # one Item object per book
            item['name'] = book.xpath('./a[@class="pic"]/@title').extract_first()
            item['introduction'] = book.xpath('./p[@class="detail"]/text()').extract_first(default='无介绍信息')
            item['author'] = book.xpath('./p/span[1]/a[1]/@title').extract_first(default='无作者信息')
            item['price'] = book.xpath('./p/span[@class="search_now_price"]/text()').extract_first()
            item['press'] = book.xpath('./p/span[3]/a/text()').extract_first(default='无出版社信息')
            item['time'] = book.xpath('./p[5]/span[2]/text()').extract_first(default='无出版时间信息')
            item['comment_num'] = book.xpath('./p[@class="search_star_line"]/a/text()').extract_first()
            # Hand the item over to the pipelines for storage
            yield item

        # Crawl 10 pages of Python books in total (page 1 comes from start_urls)
        pageNum = 10
        # Build the URL of each following page and feed it back to the engine;
        # duplicate requests are dropped by Scrapy's default dupefilter
        for page in range(2, pageNum + 1):
            url = 'http://search.dangdang.com/?key=python&page_index={}'.format(page)
            yield scrapy.Request(url, callback=self.parse)
```

Note that `extract_first()` is used so that every field is a single string (with a default value for fields that may be missing); the pipeline below relies on this.
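The XPath rules above are easiest to verify interactively before running the full spider. One way (assuming the search page is reachable from your machine) is Scrapy's built-in shell:

```python
# In a terminal:  scrapy shell 'http://search.dangdang.com/?key=python'
# Then, inside the interactive shell:
books = response.xpath('//ul[@class="bigimg"]/li')
len(books)                                                   # number of books on this page
books[0].xpath('./a[@class="pic"]/@title').extract_first()   # title of the first book
books[0].xpath('./p/span[@class="search_now_price"]/text()').extract_first()  # its price
```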
- In pipelines.py, write the data carried by each item into a MySQL database.
```python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# pymysql is used to talk to the MySQL database
import pymysql


# Pipeline that writes each scraped item into MySQL
class DdpythonPipeline(object):
    # Called once for every item yielded by the spider
    def process_item(self, item, spider):
        # Connect to the local MySQL server and select the database "dd"
        db = pymysql.connect(host="localhost", user="root", password="cpt200406",
                             db="dd", charset="utf8")
        cursor = db.cursor()  # cursor used to execute SQL statements

        # Clean up the raw values before storing them
        # (the spider uses extract_first(), so each field is already a string)
        name = item["name"]
        introduction = item["introduction"]
        author = item["author"]
        price = item["price"].strip('¥')                   # drop the leading currency sign
        press = item["press"]
        time = item["time"].strip().strip('/')             # drop the leading '/' of the date
        comment_num = item["comment_num"].strip('条评论')   # drop the trailing '条评论' ("reviews")

        # Insert the cleaned values into the table ddpython of database dd
        cursor.execute(
            'INSERT INTO ddpython(book_name,introduction,author,price,press,publication_time,comment_num) '
            'VALUES (%s,%s,%s,%s,%s,%s,%s)',
            (name, introduction, author, price, press, time, comment_num))
        db.commit()  # commit the transaction

        # Close the cursor and the connection
        cursor.close()
        db.close()
        return item
```
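The pipeline assumes that the database `dd` and the table `ddpython` already exist. A minimal setup sketch is given below; the column names follow the INSERT statement above, but the column types are assumptions, not taken from the original project:

```python
import pymysql

# Connect to the local MySQL server without selecting a database yet
db = pymysql.connect(host="localhost", user="root", password="cpt200406", charset="utf8")
cursor = db.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS dd DEFAULT CHARACTER SET utf8")
cursor.execute("USE dd")
# Column names match the pipeline's INSERT statement; the types are assumed
cursor.execute("""
    CREATE TABLE IF NOT EXISTS ddpython (
        id INT AUTO_INCREMENT PRIMARY KEY,
        book_name VARCHAR(255),
        introduction TEXT,
        author VARCHAR(255),
        price VARCHAR(32),
        press VARCHAR(255),
        publication_time VARCHAR(64),
        comment_num VARCHAR(32)
    ) DEFAULT CHARSET=utf8
""")
db.commit()
cursor.close()
db.close()
```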
- In settings.py, enable the item pipeline (the value 300 is the pipeline's order; lower numbers run earlier when several pipelines are enabled).
```python
# -*- coding: utf-8 -*-

# Scrapy settings for DDPython project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'DDPython'

SPIDER_MODULES = ['DDPython.spiders']
NEWSPIDER_MODULE = 'DDPython.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'DDPython (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'DDPython.middlewares.DdpythonSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'DDPython.middlewares.DdpythonDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'DDPython.pipelines.DdpythonPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
```
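A practical note that is not part of the original article: with the template defaults above, requests may be dropped if dangdang.com's robots.txt disallows the search pages, or rejected because of the default Scrapy User-Agent. If the crawl produces no items, the following settings.py tweaks are a common workaround (adjust them to your own situation and the site's terms of use):

```python
# Identify as a regular browser instead of the default Scrapy user agent
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36')

# Slow the crawl down a little to be polite to the site
DOWNLOAD_DELAY = 1

# Only if robots.txt blocks the search pages and you accept the implications:
# ROBOTSTXT_OBEY = False
```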
- From the project directory, start the spider with:
```
scrapy crawl <spider name>
```
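For this project the spider name is dd. To sanity-check the extracted fields without involving MySQL, the items can also be exported to a file with Scrapy's built-in feed export (the file name is just an example):

```
scrapy crawl dd
scrapy crawl dd -o books.json    # optional: dump the items to a JSON file as well
```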