说明
爬取小米有品:
把两个链接放到了一起,运行一次就可以全部获取(约700)
使用的是selenium+chrome+lxml的组合
(也很快,因为就一个页面)
输出:
程序会生成三个文件,两个csv和一个xls
csv体积小巧,通用性强
data_mi.csv使用utf-8编码
data_mi-gbk.csv使用gbk编码
xls就是excel的格式
(gbk是中文编码,可以直接用excel打开,utf-8是python默认编码,可由专业工具打开)
贴上代码
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from lxml import etree
import csv
import xlwt
import time
class Spider:
    """Scrape product listings from two Xiaomi Youpin category pages.

    Uses headless Chrome (selenium) to render the pages and lxml to parse
    them, yielding rows of [category, name, intro, price, image URL].
    Results are written to a utf-8 CSV, a gbk CSV copy, and optionally an
    xls file via :meth:`mkxls`.
    """

    def __init__(self):
        self.runtime = None  # wall-clock seconds of the last run(); set by run()
        # Two category pages: home appliances (firstId=115) and smart home (firstId=116).
        self.url = [
            'https://www.xiaomiyoupin.com/goodsbycategory?firstId=115&secondId=115&title=%E5%AE%B6%E7%94%A8%E7%94%B5%E5%99%A8&spmref=YouPinPC.$Home$.list.0.90827029',
            'https://www.xiaomiyoupin.com/goodsbycategory?firstId=116&secondId=116&title=%E6%99%BA%E8%83%BD%E5%AE%B6%E5%BA%AD&spmref=YouPinPC.$Home$.list.0.93586205'
        ]
        self.csvfilename = 'data_mi.csv'        # utf-8 output
        self.csvfilenamegbk = 'data_mi-gbk.csv'  # gbk output (Excel-friendly)
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # run Chrome without a visible window
        self.browser = webdriver.Chrome(chrome_options=chrome_options)
        self.wait = WebDriverWait(self.browser, 20)

    def run(self):
        """Entry point: scrape both category pages, append every row to the
        utf-8 CSV, then produce the gbk copy. Records elapsed time in
        ``self.runtime``.
        """
        start = time.time()
        try:
            for url in self.url:
                for item in self.parse_page(self.get_page(url)):
                    self.save_data(item)
        finally:
            # FIX: the original never released the browser; quit even on error.
            self.browser.quit()
        self.u8togbk(self.csvfilename, self.csvfilenamegbk)
        self.runtime = time.time() - start

    def get_page(self, url):
        """Load *url*, scroll to the bottom to trigger lazy loading, and
        return the fully rendered page source as a string.
        """
        self.browser.get(url)
        # Wait until the first product image is present before scrolling.
        self.wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="root"]/div/div[3]/div/div[2]/div/div[1]/div[1]/img')))
        # Simulate user scrolling so lazily-loaded content renders.
        for _ in range(50):
            self.browser.execute_script("window.scrollBy(0,1000)")
            time.sleep(0.05)
        # Grace period for the last batch of content to finish loading.
        time.sleep(5)
        return self.browser.page_source

    def parse_page(self, text):
        """Parse one rendered category page.

        Yields one ``[category, name, intro, price, img_url]`` list per
        product. Raises ``ValueError`` if a section's field lists disagree
        in length (would silently misalign rows otherwise).
        """
        html = etree.HTML(text)
        for index in range(2, 17):  # product sections live in div[2]..div[16]
            base = '//*[@id="root"]/div/div[3]/div/div[{}]'.format(index)
            classes = html.xpath(base + '/h2/text()')[0]
            names = html.xpath(base + '/div/div/p[1]/text()')
            introduces = html.xpath(base + '/div/div/p[2]/text()')
            prices = html.xpath(base + '/div/div/p[3]/span[2]/text()')
            imgs = html.xpath(base + '/div/div/div[1]/img/@src')
            # BUG FIX: the original chained `a != b != c != d` only compared
            # adjacent pairs and did NOT verify all four lengths are equal.
            if not (len(names) == len(introduces) == len(prices) == len(imgs)):
                raise ValueError(
                    'mismatched field counts in section {}'.format(index))
            print(len(names), len(introduces), len(prices), len(imgs))
            for i in range(len(names)):
                yield [classes, names[i], introduces[i], prices[i], imgs[i]]

    def save_data(self, item):
        """Append one row (list of strings) to the utf-8 CSV file."""
        with open(self.csvfilename, 'a', encoding='utf-8', newline='') as csvfile:
            print('item >>> ', item)
            writer = csv.writer(csvfile)
            writer.writerow(item)

    def u8togbk(self, infn, outfn):
        """Re-encode the utf-8 CSV *infn* as a gbk CSV *outfn*.

        Rows containing characters gbk cannot represent are skipped
        (deliberate best-effort: the gbk file loses ~2-3% of rows).
        """
        with open(infn, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            results = list(reader)
        with open(outfn, 'w', encoding='gbk', newline='') as f:
            writer = csv.writer(f)
            for result in results:
                try:
                    writer.writerow(result)
                except UnicodeEncodeError:
                    # Row has characters outside gbk; drop it on purpose.
                    pass

    def mkxls(self, out_filename):
        """Convert the gbk CSV into an xls file named *out_filename*."""
        def csv_to_xlsx(csvfile, outfile):
            """Copy every cell of *csvfile* into sheet1 of a new xls *outfile*."""
            # FIX: the source file is gbk-encoded; the original relied on the
            # platform default encoding, which fails on non-Chinese locales.
            with open(csvfile, encoding='gbk') as fc:
                r_csv = csv.reader(fc)
                workbook = xlwt.Workbook()
                sheet = workbook.add_sheet('sheet1')  # single output sheet
                for i, line in enumerate(r_csv):
                    for j, v in enumerate(line):
                        sheet.write(i, j, v)
                workbook.save(outfile)  # write the Excel file to disk
        csv_to_xlsx(self.csvfilenamegbk, out_filename)

    @property
    def time(self):
        """Human-readable total runtime of the last run()."""
        return '总共用时:{}秒'.format(self.runtime)
if __name__ == '__main__':
    # Build the scraper, crawl both category pages, then export results.
    crawler = Spider()
    crawler.run()                 # run the crawl (writes both csv files)
    crawler.mkxls('data_mi.xls')  # convert the gbk csv into an Excel xls file
    print(crawler.time)           # report total elapsed time
输出文件列表
输出文件格式
注:网页是utf-8编码的,保存成gbk的话,有些编码不支持,只能舍弃,所以utf-8编码中的内容会比gbk的多大概2-3%左右
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。