Why is it that in Scrapy, after Selenium navigates to a page and I scrape URLs from it, requesting those URLs sends them back through the Selenium middleware (which navigates all over again) instead of the response simply being handed to the callback below (parse_detail)?
def parse(self, response):
    contents = response.xpath('//*[@id="tbodyID"]/tr')
    for content in contents:
        watch_url = 'http://mooc1-1.chaoxing.com' + str(content.xpath('td[2]/a/@href').extract_first())
        # Every Request yielded here is scheduled through the downloader
        # middleware chain again before parse_detail ever runs.
        yield scrapy.Request(url=watch_url, callback=self.parse_detail, dont_filter=True)

def parse_detail(self, response):
    rows = response.xpath('/html/body/div[3]/div/div[5]/table/tbody/tr')
    for row in rows:
        item = ChaoxinItem()
        item['start_time'] = row.xpath('td[1]/text()').extract_first()
        item['watch_time'] = row.xpath('td[2]/text()').extract_first()
        item['computer'] = row.xpath('td[3]/text()').extract_first()
        yield item
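What happens here: a downloader middleware's process_request() is invoked for every request Scrapy schedules, including the detail requests yielded from parse(). The only gate in the middleware below is spider.name == "hxxy", so each detail request triggers the whole Selenium navigation again, and the HtmlResponse built from the browser's final page is what ultimately gets delivered to parse_detail. A common fix is to mark only the request that actually needs the browser. A minimal sketch, assuming a hypothetical use_selenium meta flag (the flag name is my own, not part of Scrapy):

def start_requests(self):
    # Only the entry-point request carries the flag; the requests yielded
    # from parse() have no flag, so they can be fetched by Scrapy's normal
    # downloader and reach parse_detail untouched.
    yield scrapy.Request(
        url=self.start_urls[0],
        meta={'use_selenium': True},  # hypothetical flag, checked in the middleware
        callback=self.parse,
    )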
import time

from scrapy.http import HtmlResponse


class JSPageMiddleware(object):
    # Fetch dynamic pages through Chrome instead of the default downloader
    def process_request(self, request, spider):
        if spider.name == "hxxy":
            # browser = webdriver.Chrome(executable_path="D:/Chromedriver/chromedriver.exe")
            spider.browser.get(request.url)
            time.sleep(10)
            spider.browser.find_element_by_xpath('/html/body/div[2]/div[1]/div[1]/div[1]/div/a').click()
            spider.browser.switch_to_window(spider.browser.window_handles[1])
            time.sleep(1)
            spider.browser.switch_to_frame('frame_content')
            spider.browser.find_element_by_xpath('/html/body/div/div/div[2]/div[2]/ul/li[1]/div[1]/a').click()
            spider.browser.switch_to_window(spider.browser.window_handles[2])
            spider.browser.find_element_by_xpath('//*[@id="s3"]/div/ul/li[3]/a').click()
            time.sleep(1)
            spider.browser.find_element_by_xpath('//*[@id="left"]/div[4]/div[3]/div[1]/div[1]/div/h3/a/span/span[2]').click()
            time.sleep(1)
            spider.browser.switch_to_window(spider.browser.window_handles[3])
            spider.browser.find_element_by_xpath('/html/body/div[3]/div/div[2]/div/table/tbody/tr[1]/td[6]/a').click()
            time.sleep(1)
            # Click the "more" button repeatedly to load all rows before snapshotting
            for i in range(7):
                button = spider.browser.find_element_by_xpath('//*[@id="moreButton"]')
                button.click()
                time.sleep(1)
            # Returning a Response here short-circuits the download: Scrapy skips
            # the real downloader and routes this response to the request's callback.
            return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8", request=request)
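The matching middleware-side gate, again a sketch under the same assumed flag. Returning None from process_request tells Scrapy to continue with the default download path, so untagged requests never touch the browser:

import time

from scrapy.http import HtmlResponse


class JSPageMiddleware(object):
    def process_request(self, request, spider):
        if spider.name == "hxxy" and request.meta.get('use_selenium'):
            spider.browser.get(request.url)
            time.sleep(10)
            # ... the click / switch_to_window sequence from above ...
            return HtmlResponse(url=spider.browser.current_url,
                                body=spider.browser.page_source,
                                encoding="utf-8", request=request)
        return None  # fall through to the default downloader

With this split, parse_detail receives the plain HTTP response for each watch_url instead of a page the browser re-navigated.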