spider.py代码
# -*- coding: utf-8 -*-
import scrapy
from topgoods.items import TopgoodsItem
class TmGoodsSpider(scrapy.Spider):
    """Crawl a Tmall search-result page and the detail page of each product.

    ``parse`` reads price, name and link of every product on the list page
    and schedules one request per product; ``parse_detail`` completes the
    item with shop and company data and yields it.

    NOTE(review): if the site answers with a redirect to a login page, the
    list XPath matches nothing and only "List Page error" is logged --
    confirm whether session cookies have to be sent with the requests.
    """

    name = "tm_goods"
    # Domain names only (no scheme, no path). "tmall.com" also covers
    # sub-domains such as list.tmall.com.
    allowed_domains = ["tmall.com"]
    start_urls = (
        'https://list.tmall.com/search_product.htm?q=%C5%AE%D7%B0&type=p&spm=a220m.1000858.a2227oh.d100&from=.list.pc_1_searchbutton',
    )
    # Number of list pages processed so far (shared by all instances).
    count = 0

    def parse(self, response):
        """Yield one detail-page request per product found on a list page.

        The partially filled ``TopgoodsItem`` travels to ``parse_detail``
        through ``meta['item']``.
        """
        TmGoodsSpider.count += 1
        divs = response.xpath("//div[@id='J_ItemList']/div[@class='product']/div")
        if not divs:
            self.log("List Page error--%s" % response.url)
        for div in divs:
            # extract_first() returns None instead of raising IndexError
            # when a node is missing, so one odd entry cannot abort the page.
            href = div.xpath("p[@class='productTitle']/a/@href").extract_first()
            if href is None:
                # Without a link there is no detail page to visit.
                self.log("List item without link--%s" % response.url)
                continue
            item = TopgoodsItem()
            # Product price
            item["GOODS_PRICE"] = div.xpath("p[@class='productPrice']/em/@title").extract_first()
            # Product name
            item["GOODS_NAME"] = div.xpath("p[@class='productTitle']/a/@title").extract_first()
            # Product link: urljoin resolves protocol-relative ("//host/..")
            # and relative links and leaves absolute http/https URLs intact.
            item["GOODS_URL"] = response.urljoin(href)
            yield scrapy.Request(
                url=item["GOODS_URL"],
                meta={'item': item},
                callback=self.parse_detail,
                dont_filter=True,
            )

    def parse_detail(self, response):
        """Add shop and company fields to the item and yield it.

        Nothing is yielded when the page has no ``div.extend`` block; the
        URL is logged instead.
        """
        item = response.meta['item']
        blocks = response.xpath('//div[@class="extend"]/ul')
        if not blocks:
            self.log("Detail Page error--%s" % response.url)
            # Stop here: indexing the empty result would raise IndexError.
            return
        div = blocks[0]
        # Shop name
        item["SHOP_NAME"] = div.xpath("li[1]/div/a/text()").extract_first()
        # Shop link
        item["SHOP_URL"] = div.xpath("li[1]/div/a/@href").extract_first()
        # Company name
        company_name = div.xpath("li[3]/div/text()").extract_first()
        item["COMPANY_NAME"] = company_name.strip() if company_name is not None else None
        # Company location
        company_address = div.xpath("li[4]/div/text()").extract_first()
        item["COMPANY_ADDRESS"] = company_address.strip() if company_address is not None else None
        yield item
结果:
10-15 19:20:06 [scrapy] DEBUG: Redirecting (302) to <GET https://login.taob
m/jump?target=https%3A%2F%2Flist.tmall.com%2Fsearch_product.htm%3Ftbpm%3D1%
D%25C5%25AE%25D7%25B0%26type%3Dp%26spm%3Da220m.1000858.a2227oh.d100%26from%
st.pc_1_searchbutton> from <GET https://list.tmall.com/search_product.htm?q
AE%D7%B0&type=p&spm=a220m.1000858.a2227oh.d100&from=.list.pc_1_searchbutton
10-15 19:20:06 [scrapy] DEBUG: Redirecting (302) to <GET https://pass.tmall
add?_tb_token_=KL9DqtpQ4JXA&cookie2=fc1318de70224bfb4688cb59f2166e17&t=4d43
c2cda976f8ace84a7f74a08&target=https%3A%2F%2Flist.tmall.com%2Fsearch_produc
%3Ftbpm%3D1%26q%3D%25C5%25AE%25D7%25B0%26type%3Dp%26spm%3Da220m.1000858.a22
d100%26from%3D.list.pc_1_searchbutton&pacc=RRsp0ixWwD7auxG1xr9HDg==&opi=59.
.222&tmsc=1444908006341549> from <GET https://login.taobao.com/jump?target=
%3A%2F%2Flist.tmall.com%2Fsearch_product.htm%3Ftbpm%3D1%26q%3D%25C5%25AE%25
B0%26type%3Dp%26spm%3Da220m.1000858.a2227oh.d100%26from%3D.list.pc_1_search
n>
看起来请求被 302 重定向了(从搜索列表页先跳到 login.taobao.com,再跳到 pass.tmall.com)。请问我应该怎样修改代码,才能抓取到想要的信息?
请求被重定向到了登录页面——天猫有反爬虫机制。建议仔细研究一下天猫 detail 域名下的 cookie,在请求中带上这些 cookie 再访问。