# Design: two chained pipelines with MongoDB-based deduplication.
# The first pipeline checks MongoDB for an existing record; if the item is a
# duplicate, it is dropped so the later pipelines (including the download
# pipeline) never run for it. If it is new, the record is inserted into the
# database and the item is passed on to the download pipeline, which then
# fetches the file.
import pymongo
from scrapy import Request
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy.pipelines.files import FilesPipeline
class XiaoMiQuanPipeLines(object):
    """Deduplicating MongoDB pipeline.

    Looks up each item's (url, name) pair in MongoDB. Duplicates raise
    DropItem, which stops all later pipelines (including the file download
    pipeline). New items are inserted and passed on unchanged.
    """

    def __init__(self):
        # Connection parameters come from the project settings.
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        sheetname = settings["MONGODB_SHEETNAME"]
        client = pymongo.MongoClient(host=host, port=port)
        mydb = client[dbname]
        # Collection used for the dedup records.
        self.post = mydb[sheetname]

    def process_item(self, item, spider=None):
        """Insert the item's record if unseen; raise DropItem for duplicates.

        Scrapy invokes process_item(item, spider); `spider` defaults to
        None so existing direct calls with only an item keep working.

        Raises:
            DropItem: when a record with the same url and name already
                exists, so later pipelines are skipped for this item.
        """
        url = item['file_url']
        name = item['name']
        # The original aggregate([{"$group": ...}]) call returned a cursor
        # that is always truthy, so the duplicate branch always ran and
        # nothing was ever inserted. find_one is the correct existence check.
        if self.post.find_one({"url": url, "name": name}) is not None:
            # Duplicate: stop the remaining pipelines (e.g. the downloader).
            raise DropItem("Duplicate item: %s (%s)" % (name, url))
        # insert() is deprecated in pymongo; insert_one is the modern API.
        self.post.insert_one({"url": url, "name": name})
        return item
class DownLoadPipelines(FilesPipeline):
    """FilesPipeline subclass that stores each download under item['name']."""

    def get_media_requests(self, item, info):
        # Stash the target filename in request.meta so file_path can read it.
        yield Request(url=item['file_url'], meta={'filename': item['name']})

    def file_path(self, request, response=None, info=None):
        # Filename was attached to the request by get_media_requests;
        # fall back to an empty string if it is missing.
        return request.meta.get('filename', '')
# See the official Scrapy item-pipeline documentation for DropItem
# (scrapy.exceptions.DropItem): raising it stops later pipelines for an item.