本次爬虫爬取了京东的口罩信息,并将数据保存至MongoDB数据库。其中config为配置信息:
MONGO_URL = 'localhost'
MONGO_DB = 'jingdong'
MONGO_TABLE = 'mask'
正文:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from bs4 import BeautifulSoup
from config import *
import pymongo
# Connect to the local MongoDB instance (connection settings come from config.py).
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
# Create the Chrome driver and an explicit wait with a 10-second timeout;
# every page interaction below goes through this `wait` object.
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10)
def search(keys):
    """Open the JD home page, search for *keys*, scrape the first result
    page, and return the total number of result pages (as the text of the
    page-count element).

    Retries the whole flow on TimeoutException.
    """
    try:
        browser.get('https://www.jingdong.com/')
        # Click through Chrome's security interstitial ("Details" ->
        # "Proceed") that this site triggers before the page loads.
        safe_button_1 = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#details-button")))
        safe_button_1.click()
        safe_button_2 = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#proceed-link")))
        safe_button_2.click()
        # Type the keyword into the search box and submit.
        search_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#key")))
        search_box.clear()
        search_box.send_keys(keys)
        button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#search > div > div.form > button")))
        button.click()
        # Total page count shown in the bottom pagination bar.
        total = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > em:nth-child(1) > b"))).text
        get_infomation()
        return total
    except TimeoutException:
        # Bug fix: the original recursed without `return`, so any timeout
        # made search() return None and main() crashed on int(None).
        return search(keys)
def swich_to_page(page_num):
    """Jump to result page *page_num* using the page-number input box at the
    bottom of the results, then wait until that page is highlighted as
    current. Retries on TimeoutException.
    """
    input_selector = "#J_bottomPage > span.p-skip > input"
    go_selector = "#J_bottomPage > span.p-skip > a"
    current_selector = '#J_bottomPage > span.p-num > a.curr'
    try:
        num_box = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, input_selector)))
        next_page_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, go_selector)))
        num_box.clear()
        num_box.send_keys(page_num)
        next_page_button.click()
        # The jump is done once the highlighted page number equals the target.
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, current_selector), str(page_num)))
    except TimeoutException:
        swich_to_page(page_num)
def get_infomation():
    """Parse the currently loaded result page with BeautifulSoup and save
    each product (price/name/comment/shop/label) to MongoDB.
    """
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_goodsList > ul')))
    soup = BeautifulSoup(browser.page_source, 'lxml')
    for product in soup.find_all('li', class_='gl-item'):
        def _text(css_class):
            # Robustness fix: sponsored/ad items can lack a sub-div, in
            # which case find() returns None and `.text` would raise
            # AttributeError, aborting the whole page. Fall back to ''.
            node = product.find('div', class_=css_class)
            return node.text.strip() if node else ''
        p = {
            'price': _text('p-price'),
            'name': _text('p-name'),
            'comment': _text('p-commit'),
            'shop': _text('p-shop'),
            'label': _text('p-icons'),
        }
        save_to_mongo(p)
def save_to_mongo(result):
    """Insert one product record into the MongoDB collection and print
    whether the insert succeeded.
    """
    try:
        if db[MONGO_TABLE].insert_one(result):
            print('保存成功', result)
    except Exception:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and made Ctrl-C unreliable.
        print('存储失败', result)
def main(keys):
    """Search JD for *keys* and scrape every page of results into MongoDB."""
    try:
        total = search(keys)
        # Page 1 is already scraped inside search(); continue from page 2.
        for i in range(2, int(total) + 1):
            swich_to_page(i)
            get_infomation()
            time.sleep(1)  # throttle page turns to be polite to the site
    except Exception:
        print('出错了')
    finally:
        # Bug fix: quit() shuts down the whole WebDriver session; the
        # original close() only closed the window and leaked the
        # chromedriver process.
        browser.quit()
# Script entry point: crawl listings for the keyword "口罩" (face masks).
if __name__ == '__main__':
    main('口罩')
最后可以在数据库中查看爬取到的信息。
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。