一:安装步骤
# Import the signing key 627220E7 from keyserver.ubuntu.com into apt's keyring.
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 627220E7
# Register the archive.scrapy.org repository by writing it to /etc/apt/sources.list.d/scrapy.list.
echo 'deb http://archive.scrapy.org/ubuntu scrapy main' | sudo tee /etc/apt/sources.list.d/scrapy.list
# Refresh the package index, then install the scrapy-0.25 package (install runs only if the update succeeds).
sudo apt-get update && sudo apt-get install scrapy-0.25
二:执行完毕后,在终端输入 scrapy,若不报错即说明安装成功。然后输入以下命令创建项目:
scrapy startproject TestMore
该命令会在当前目录下创建名为 TestMore 的 Scrapy 项目目录。下面让我们编写一个爬虫来抓取网页。
以 http://movie.douban.com/top250 (豆瓣电影 Top250)为例,爬虫文件如下:
TestMore/spiders/spider.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
from scrapy.selector import Selector
from doubanmovie.items import DoubanmovieItem
class TestMore(CrawlSpider):
    """Spider that scrapes the Douban Top 250 movie list, page by page.

    For every movie entry on a page it yields one ``DoubanmovieItem`` with the
    fields ``title``, ``movieInfo``, ``star`` and ``quote``, then follows the
    "next page" link (if any) back into :meth:`parse`.

    NOTE(review): Scrapy's documentation warns against overriding ``parse``
    on a ``CrawlSpider`` that relies on ``rules``. No rules are defined here,
    so the override is used as a plain callback -- confirm this is intended.
    """

    name = "TestMore"
    # Not read anywhere in this class; presumably a leftover scrapy-redis
    # style key -- TODO confirm whether anything external still needs it.
    redis_key = 'TestMore:start_urls'
    start_urls = ['http://movie.douban.com/top250']
    # Base listing URL; kept as a class attribute for backward compatibility.
    url = 'http://movie.douban.com/top250'

    def parse(self, response):
        """Extract all movies from one listing page and schedule the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            One ``DoubanmovieItem`` per movie block found on the page,
            followed by a ``Request`` for the next page when a next-page link
            is present.
        """
        # Function-scope import so the block works on both Python 2 and 3
        # without touching the file-level imports.
        try:
            from urllib.parse import urljoin
        except ImportError:
            from urlparse import urljoin

        selector = Selector(response)

        for movie in selector.xpath('//div[@class="info"]'):
            title_parts = movie.xpath(
                'div[@class="hd"]/a/span/text()').extract()
            movie_info = movie.xpath('div[@class="bd"]/p/text()').extract()
            stars = movie.xpath(
                'div[@class="bd"]/div[@class="star"]/span/em/text()').extract()
            quotes = movie.xpath(
                'div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            # Build a fresh item per movie: reusing one mutable item across
            # yields lets later iterations overwrite data that downstream
            # consumers may still be holding a reference to.
            item = DoubanmovieItem()
            item['title'] = ''.join(title_parts)
            item['movieInfo'] = ';'.join(movie_info)
            # The rating and the quote may both be absent; fall back to an
            # empty string instead of raising IndexError.
            item['star'] = stars[0] if stars else ''
            item['quote'] = quotes[0] if quotes else ''
            yield item

        # Pagination: follow the "next" link when the page provides one.
        next_links = selector.xpath(
            '//span[@class="next"]/link/@href').extract()
        if next_links:
            next_link = next_links[0]
            print(next_link)
            # urljoin handles query-only, relative and absolute hrefs alike,
            # unlike plain string concatenation.
            yield Request(urljoin(response.url, next_link),
                          callback=self.parse)