# -*- coding: utf-8 -*-
import json
import scrapy
from scrapy import Request
from cosmetics.items import CosmeticsItem
class CosSpider(scrapy.Spider):
name = 'cos'
# allowed_domains = ['www.jd.com']
# start_urls = ['https://search.jd.com/Search?keyword=%E5%8F%A3%E7%BA%A2&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&stock=1&page=1&s=54&click=0']
def start_requests(self):
star_url = 'https://search.jd.com/'
yield Request(url=star_url, callback=self.parse, meta={"data": "0"})
def parse(self, response):
item = CosmeticsItem()
ul_list = response.css('#J_goodsList > ul > li')
page_next = response.css('#J_bottomPage > span.p-num > a.pn-next')
print("ul_list is :::::::", ul_list)
'''
for li in ul_list:
item = CosItemLoader(item=CosItem(), response=li)
with open('rule.json', 'r') as f:
data = json.load(f)
for keys in data:
item.add_xpath(keys, data[keys])
# item.add_xpath('img', './/div[@class="p-img"]/a/img/@href')
# item.add_xpath('price', './/div[@class="p-price"]//i/text()')
# item.add_xpath('name', './/div[@class="p-name p-name-type-2"]//em/text()')
# item.add_xpath('commit_counts', './/div[@class="p-commit"]//a/text()')
# item.add_xpath('shop', './/div[@class="p-shop"]/span/a/text()')
yield item.load_item()
'''
for l in ul_list:
#al = ul.extract()
print("ul is :::::", l.extract())
img = 'https:' + l.xpath('//div[@class="p-img"]/a/img/@src').extract()[0]
price = l.xpath('//div[@class="p-price"]//i/text()').extract()
name = l.xpath('//div[@class="p-name p-name-type-2"]//em/text()').extract()[0].strip(' ')
commits = l.xpath('//div[@class="p-commit"]//a/text()').extract()[0]
shop = l.xpath('//div[@class="p-shop"]/span/a/text()').extract()[0]
for field in item.fields.keys():
item[field] = eval(field)
yield item
if len(page_next) > 0:
yield Request(url=response.url, callback=self.parse, dont_filter=True, meta={"data": "2"})
else:
print("数据爬取完毕")
先贴spider中代码,遍历取出来的li列表,然后用xpath取数据的时候不是跟着遍历一个一个取,而是一次直接把所有的取出来了,这个是怎么回事啊,
为什么不跟着遍历一个一个取,而是一下取完?这种情况怎么解决??求大佬解决啊!!!!!
//div[@class="p-price"]//i/text()').extract()少加了个点