Install Scrapy

```
pip install Scrapy
```
Create a project

```
scrapy startproject tutorial
```
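Roughly, the generated project layout looks like this (the exact files vary a bit between Scrapy versions):

```
tutorial/
    scrapy.cfg            # deploy/run configuration
    tutorial/             # the project's Python package
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
```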
Create a spider

Create a quotes_spider.py file in the tutorial/spiders directory with the following code:
```python
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'https://segmentfault.com/blog/sown',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for quote in response.css('section.stream-list__item'):
            print(quote.css('h2.title a::text').extract_first())
            print(quote.css('h2.title a::attr(href)').extract_first())
```
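As a quick sanity check, the CSS selectors above can be tried out interactively with `scrapy shell` before running the crawl (optional, but it saves re-running the spider while tuning selectors):

```
scrapy shell 'https://segmentfault.com/blog/sown'
>>> response.css('section.stream-list__item h2.title a::text').extract_first()
```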
Pre-launch configuration

Add the following to settings.py:

```python
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
ROBOTSTXT_OBEY = False
```
Run the project

```
scrapy crawl quotes
```
The console prints DEBUG and INFO messages, along with the scraped article titles and links. With that, the basic flow of a minimal first spider is working end to end.
Scraping second-level pages

quotes_spider.py:
```python
import urllib.parse

import scrapy


def parse_article(response):
    article = response.css('article.article').extract_first()
    print(article)


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'https://segmentfault.com/blog/sown',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for quote in response.css('section.stream-list__item'):
            print(quote.css('h2.title a::text').extract_first())
            # Links in the list page are relative, so resolve them against the response URL
            article = urllib.parse.urljoin(response.url, quote.css('h2.title a::attr(href)').extract_first())
            yield scrapy.Request(
                url=article,
                callback=parse_article
            )
```
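As a side note, on Scrapy 1.4+ `response.follow` resolves relative URLs for you, so the `urljoin` step can be dropped. A minimal sketch of the same `parse` method under that assumption:

```python
    def parse(self, response):
        for quote in response.css('section.stream-list__item'):
            print(quote.css('h2.title a::text').extract_first())
            # response.follow resolves the relative href against response.url
            yield response.follow(
                quote.css('h2.title a::attr(href)').extract_first(),
                callback=parse_article
            )
```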
Saving data to MySQL

items.py:
```python
# -*- coding: utf-8 -*-
import scrapy


class ArticleItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
```
pipelines.py:
```python
# -*- coding: utf-8 -*-
import pymysql
from pymysql.cursors import DictCursor


class TutorialPipeline(object):
    def process_item(self, item, spider):
        return item


class MySQLPipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            db='spider',
            user='root',
            passwd='root',
            charset='utf8',
            use_unicode=True)
        self.cursor = self.connect.cursor(DictCursor)

    def process_item(self, item, spider):
        self.cursor.execute(
            """insert into article(
                title,
                content
            ) values (%s, %s)""",
            (
                item['title'],
                item['content']
            )
        )
        self.connect.commit()
        return item
```
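The pipeline assumes the `spider` database and an `article` table already exist. A minimal one-off setup sketch; the column types are assumptions, so adjust them to your data:

```python
import pymysql

# One-off setup: create the database/table the pipeline writes into.
# Column types are assumptions; content uses LONGTEXT because article
# HTML can easily exceed the TEXT limit.
connect = pymysql.connect(host='127.0.0.1', port=3306,
                          user='root', passwd='root', charset='utf8')
cursor = connect.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS spider DEFAULT CHARACTER SET utf8")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS spider.article (
        id INT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        content LONGTEXT
    ) DEFAULT CHARACTER SET utf8
""")
connect.commit()
connect.close()
```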
quotes_spider.py:
```python
import urllib.parse

import scrapy

from ..items import ArticleItem


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def start_requests(self):
        urls = [
            'https://segmentfault.com/blog/sown',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for quote in response.css('section.stream-list__item'):
            title = quote.css('h2.title a::text').extract_first()
            article = urllib.parse.urljoin(response.url, quote.css('h2.title a::attr(href)').extract_first())
            yield scrapy.Request(
                url=article,
                callback=self.parse_article,
                # Pass the title from the list page along to the article callback
                meta={'title': title}
            )

    def parse_article(self, response):
        title = response.meta['title']
        content = response.css('article.article').extract_first()
        item = ArticleItem()
        item['title'] = title
        item['content'] = content
        yield item
```
settings.py:
```python
ITEM_PIPELINES = {
    # the number sets the pipeline order (lower values run first)
    'tutorial.pipelines.MySQLPipeline': 300
}
```
Passing a start_url argument on the launch command

quotes_spider.py:
```python
import urllib.parse

import scrapy

from ..items import ArticleItem


class QuotesSpider(scrapy.Spider):
    name = "quotes"

    def __init__(self, start_url=None, *args, **kwargs):
        super(QuotesSpider, self).__init__(*args, **kwargs)
        # -a start_url=... on the command line arrives here as a keyword argument
        self.start_url = start_url

    def start_requests(self):
        urls = [
            'https://segmentfault.com',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        yield scrapy.Request(
            self.start_url,
            callback=self.parse_list,
            meta={}
        )

    def parse_list(self, response):
        for quote in response.css('section.stream-list__item'):
            title = quote.css('h2.title a::text').extract_first()
            article = urllib.parse.urljoin(response.url, quote.css('h2.title a::attr(href)').extract_first())
            yield scrapy.Request(
                url=article,
                callback=self.parse_article,
                meta={'title': title}
            )

    def parse_article(self, response):
        title = response.meta['title']
        content = response.css('article.article').extract_first()
        item = ArticleItem()
        item['title'] = title
        item['content'] = content
        yield item
```
Run it with:

```
scrapy crawl quotes -a start_url=https://segmentfault.com/blog/sown
```
Some problems you may run into
- The scraped content contains `<br>`, so what should come back as a string comes back as a list. Normally `text = response.css('[id=content]::text').extract()` should give you all of the text, but because the content contains `<br>`, you get a list split at the `<br>` tags. Depending on the situation, you then either join the list back together or change your selector strategy (see the sketch after this list).
- Calling Scrapy from Flask fails with "ValueError: signal only works in main thread". Switch to launching the crawl as a subprocess instead: `subprocess.run(['scrapy', 'crawl', 'nmzsks', "-a", "year=" + year, "-a", "start_url=" + start_url], shell=True)` (a fuller sketch follows this list).
- `No module named ArticleItem.items` — use a relative import with `..`: `from ..items import ArticleItem`
- Chinese text is garbled after scraping. The page is probably not utf-8 encoded and Scrapy doesn't cope well; convert it with `content.encode('latin1').decode('gbk')`.
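For the `<br>` case, a minimal sketch of the join approach, assuming the `[id=content]` selector from that item:

```python
def extract_text(response):
    # ::text yields one fragment per text node, so <br> splits the content
    # into several pieces; join them back into a single string.
    fragments = response.css('[id=content]::text').extract()
    return ''.join(fragment.strip() for fragment in fragments)
```

And for the Flask case, a minimal sketch of launching the crawl from a route in a separate process (the route, spider name, argument, and working directory are illustrative):

```python
import subprocess

from flask import Flask

app = Flask(__name__)


@app.route('/crawl')
def crawl():
    # Running scrapy in its own process sidesteps the
    # "signal only works in main thread" error; note this
    # blocks the request until the crawl finishes.
    subprocess.run([
        'scrapy', 'crawl', 'quotes',
        '-a', 'start_url=https://segmentfault.com/blog/sown',
    ], cwd='tutorial')  # adjust cwd to wherever scrapy.cfg lives
    return 'ok'
```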