I'm practicing with Python 3.9 + Scrapy + Splash to crawl an article list and each article's content, but when I use lua_source to send custom content to Splash, the following error appears (from the Splash Docker container's log):
2020-10-20 11:50:49.420004 [-] Server listening on http://0.0.0.0:8050
2020-10-20 12:15:55.623975 [events] {"path": "/execute", "rendertime": 8.717848777770996, "maxrss": 247656, "load": [0.0, 0.04, 0.01], "fds": 82, "active": 0, "qsize": 0, "_id": 140574281705680, "method": "POST", "timestamp": 1603196155, "user-agent": "Scrapy/2.4.0 (+https://scrapy.org)", "args": {"cookies": [], "headers": {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en", "User-Agent": "Scrapy/2.4.0 (+https://scrapy.org)"}, "lua_source": "\nfunction main(splash, args)\n assert(splash:go(args.url))\n assert(splash:wait(args.wait))\n assert(splash:wait(args.wait))\n return splash:html()\nend\n", "page": 1, "timeout": 90, "url": "https://segmentfault.com/blogs?page=1", "wait": 3, "uid": 140574281705680}, "status_code": 200, "client_ip": "172.17.0.1"}
2020-10-20 12:15:55.624471 [-] "172.17.0.1" - - [20/Oct/2020:12:15:55 +0000] "POST /execute HTTP/1.1" 200 78667 "-" "Scrapy/2.4.0 (+https://scrapy.org)"
2020-10-20 12:16:03.121159 [events] {"path": "/execute", "rendertime": 7.355923414230347, "maxrss": 281760, "load": [0.29, 0.1, 0.03], "fds": 73, "active": 0, "qsize": 0, "_id": 140574661640768, "method": "POST", "timestamp": 1603196163, "user-agent": "Scrapy/2.4.0 (+https://scrapy.org)", "args": {"cookies": [], "headers": {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en", "Referer": "https://segmentfault.com/blogs?page=1", "User-Agent": "Scrapy/2.4.0 (+https://scrapy.org)"}, "lua_source": "\nfunction main(splash, args)\n assert(splash:go(args.url))\n assert(splash:wait(args.wait))\n assert(splash:wait(args.wait))\n return splash:html()\nend\n", "timeout": 90, "url": "https://segmentfault.com//a/1190000037533517", "wait": 3, "uid": 140574661640768}, "status_code": 200, "client_ip": "172.17.0.1"}
2020-10-20 12:16:03.121436 [-] "172.17.0.1" - - [20/Oct/2020:12:16:02 +0000] "POST /execute HTTP/1.1" 200 144939 "-" "Scrapy/2.4.0 (+https://scrapy.org)"
2020-10-20 12:16:03.274100 [events] {"path": "/execute", "rendertime": 0.01170206069946289, "maxrss": 281760, "load": [0.29, 0.1, 0.03], "fds": 56, "active": 0, "qsize": 0, "_id": 140574661640768, "method": "POST", "timestamp": 1603196163, "user-agent": "Scrapy/2.4.0 (+https://scrapy.org)", "args": {"article_content": "<article class=\"article fmt article-content\" data-id=\"1190000037533517\" data-license=\"cc\"> </p> ...(some very long html nodes)...</article>", "cookies": [], "headers": {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Language": "en", "User-Agent": "Scrapy/2.4.0 (+https://scrapy.org)"}, "lua_source": "\nfunction main(splash, args)\n splash.images_enabled = false\n js = string.format(\"document.querySelector('textarea#original_content').value=%s\", args.article_content)\n splash:evaljs(js)\n splash:wait(args.wait)\n click_js = string.format(\"document.querySelector('#translate_button').click()\")\n splash:evaljs(click_js)\n splash:wait(args.wait)\n return_js = string.format(\"document.querySelector('textarea#md_content').value\")\n return splash:evaljs(return_js)\nend\n", "url": "http://localhost:8080", "wait": 1, "uid": 140574661640768}, "status_code": 400, "client_ip": "172.17.0.1", "error": {"error": 400, "type": "ScriptError", "description": "Error happened while executing Lua script", "info": {"type": "JS_ERROR", "js_error_type": "SyntaxError", "js_error_message": "Unexpected token '<'", "js_error": "SyntaxError: Unexpected token '<'", "message": "[string \"...\"]:5: JS error: \"SyntaxError: Unexpected token '<'\"", "splash_method": "evaljs", "source": "[string \"...\"]", "line_number": 5, "error": "JS error: \"SyntaxError: Unexpected token '<'\""}}}
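Reading that last log entry, the failing call is the splash:evaljs on line 5 of my second Lua script (md_script, shown below), and the JS error is a SyntaxError on a '<' token. My best reconstruction of the JS string that evaljs receives, assuming string.format splices the raw HTML in unquoted (a sketch, not output from the real run):

# Hypothetical reconstruction of the JS built inside md_script.
# Lua's string.format("...value=%s", args.article_content) inserts the HTML
# verbatim, with no quoting, so the right-hand side starts with a bare '<'.
article_content = '<article class="article fmt article-content"> ... </article>'  # shortened stand-in
js = "document.querySelector('textarea#original_content').value=%s" % article_content
print(js)
# document.querySelector('textarea#original_content').value=<article class="article fmt article-content"> ... </article>
# Evaluating that as JavaScript would raise: SyntaxError: Unexpected token '<'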
The contents of my spider entry file:
from scrapy import Spider, Request
from udaskweb.items import UdaskwebItem
from scrapy_splash import SplashRequest
from scrapy.selector import Selector
script = """
function main(splash, args)
assert(splash:go(args.url))
assert(splash:wait(args.wait))
assert(splash:wait(args.wait))
return splash:html()
end
"""
md_script = """
function main(splash, args)
splash.images_enabled = false
js = string.format("document.querySelector('textarea#original_content').value=%s", args.article_content)
splash:evaljs(js)
splash:wait(args.wait)
click_js = string.format("document.querySelector('#translate_button').click()")
splash:evaljs(click_js)
splash:wait(args.wait)
return_js = string.format("document.querySelector('textarea#md_content').value")
return splash:evaljs(return_js)
end
"""
class SegmentSpider(Spider):
    name = 'segment'
    allowed_domains = ['xxxx.com', 'localhost']
    md_url = 'http://localhost:8080'
    start_urls = 'https://xxxx.com/blogs'
    start_domains = 'https://xxxx.com'

    def start_requests(self):
        for page in range(1, self.settings.get('MAX_PAGE') + 1):
            url = self.start_urls + "?page=" + str(page)
            yield SplashRequest(url, callback=self.parse, endpoint='execute',
                                args={'lua_source': script, 'wait': 3, 'page': page, 'timeout': 90})

    def parse(self, response):
        item = UdaskwebItem()
        articles = response.css("div.blog-stream").xpath(
            'section[@class="stream-list__item"]//div[@class="summary"]//h2/a/@href').extract()
        item['links'] = articles
        detail_url = self.start_domains + "/" + articles[0]
        yield SplashRequest(detail_url, meta={"item": item}, callback=self.article_detail,
                            endpoint='execute', args={'lua_source': script, 'wait': 3, 'timeout': 90})

    def article_detail(self, response):
        item = response.meta["item"]
        article = response.css("div.card-body")
        article_content = article.xpath(
            './/article[contains(@class, "article-content")]').extract_first()
        # The line below is where the problem occurs. At first I thought it was because the
        # article_content being passed is too long (it really is very long), but when I changed
        # article_content to 'dddddd' for a test, the same error still appeared.
        yield SplashRequest(self.md_url, callback=self.get_item, endpoint='execute',
                            args={'lua_source': md_script, 'article_content': article_content, 'wait': 1})

    def get_item(self, response):
        print("=================================")
This has been bothering me for several days now. Thanks in advance for any help.