This is a crawler practice project built mainly on requests. Because the pages have to be analyzed inside their `<script>` blocks, I went straight to `re` regular-expression matching; in general BeautifulSoup works too and should be more convenient.
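For the plain-HTML parts, a BeautifulSoup version of the extraction would look roughly like this — a minimal sketch, assuming the `zs_img` / `zs_t` / `zs_c` markup that the regexes in the full script below target (note that `var nextUrl` lives inside a `<script>`, so even here you would still need a regex for it, which is why the post goes all-regex):

```python
# Hypothetical BeautifulSoup equivalent of the regex extraction used below.
# Assumes `pip install beautifulsoup4`; the class names zs_img/zs_t/zs_c are
# taken from the regexes in the full script, not re-verified against the site.
import requests
from bs4 import BeautifulSoup

r = requests.get('http://newcar.xcar.com.cn/2674/2015/detail/1.htm')
r.encoding = 'gbk'  # the site serves GBK-encoded pages
soup = BeautifulSoup(r.text, 'html.parser')

img = soup.select_one('div.zs_img img')   # the photo
title = soup.select_one('div.zs_t')       # short caption
cont = soup.select_one('div.zs_c')        # description text
print(img['src'] if img else None,
      title.get_text(strip=True) if title else '')
```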
The approach:

1. Read the entry page, e.g. http://newcar.xcar.com.cn/2674/2015/detail/1.htm — to capture a whole gallery, we always start from the URL ending in `1.htm`.
2. In the page's `<script>` block, find `var nextUrl`: that address is the next page in the auto-play sequence. At the same time, read out the `img src`, i.e. the image URL, and save the image. (I pack the model, title, and description into the target file name; if you only want the pictures, that part is unnecessary.) A minimal sketch of this extraction follows the list.
3. Recurse until every page has been fetched.

One extra detail: an `xcar_lst` collects each page's info (image URL, next URL, title, and so on). It is kept purely as a record and is not used for anything yet.
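To make step 2 concrete, here is the core extraction run against a hand-written sample of the embedded markup (the sample HTML is invented for illustration; the patterns are the same ones used in the full script):

```python
import re

# Invented sample of what a detail page embeds (for illustration only).
html = """
<script>var nextUrl = '/2674/2015/detail/2.htm';</script>
<div class="zs_img"><img src="http://img.xcar.com.cn/sample/pic_1.jpg" /></div>
"""

m = re.search(r"var nextUrl = '(?P<n_url>.*\.htm)'", html)
next_url = m.group('n_url') if m else None   # '/2674/2015/detail/2.htm'

m = re.search(r'<div class="zs_img"><img src="(?P<pic_url>.*\.jpg)"', html)
pic_url = m.group('pic_url') if m else None  # the image URL
print(next_url, pic_url)
```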
The code:
```python
# coding: utf-8
__author__ = 'BONFY CHEN'

import re

import requests

PROXIES = None
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/38.0.2125.122 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8'
}
BASE_FOLDER = 'D:/xxx_folder/'


class xcarDown(object):

    _website = 'http://newcar.xcar.com.cn'

    def __init__(self, base_folder=BASE_FOLDER, proxies=PROXIES, headers=HEADERS):
        self.set_base_folder(base_folder)
        self.set_headers(headers)
        self.set_proxies(proxies)
        self._xcar_lst = []  # per-instance log of every page crawled

    def set_base_folder(self, base_folder):
        self._base_folder = base_folder

    def set_headers(self, headers):
        self._headers = headers

    def set_proxies(self, proxies):
        self._proxies = proxies

    def download_image_from_url(self, url, name=None):
        """Download one image.

        :param url: the resource image url
        :param name: the destination file name prefix
        :return: the local file name
        """
        local_filename = (name or '') + '_' + url.split('/')[-1]
        r = requests.get(url, proxies=self._proxies, headers=self._headers, stream=True)
        with open(self._base_folder + local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
        return local_filename

    def download_xcar(self, url):
        """Download one detail page, then follow nextUrl recursively.

        :param url: a detail page on xcar.com.cn,
                    e.g. http://newcar.xcar.com.cn/2674/2015/detail/1.htm
        """
        r = requests.get(url, proxies=self._proxies, headers=self._headers)
        r.encoding = 'gbk'  # the site serves GBK-encoded pages

        # nextUrl sits inside a <script> block: the page the gallery auto-plays to
        m1 = re.search(r"var nextUrl = '(?P<n_url>.*\.htm)'", r.text)
        next_url = m1.group('n_url') if m1 else None

        m2 = re.search(r'<div class="zs_img"><img src="(?P<pic_url>.*\.jpg)"', r.text)
        pic_url = m2.group('pic_url') if m2 else None

        m3 = re.search(r'<div class="zs_t">(?P<title>.*)</div>', r.text)
        title = m3.group('title') if m3 else ''

        m4 = re.search(r'<div class="zs_c">(?P<cont>.*)</div>', r.text)
        cont = m4.group('cont') if m4 else ''

        m5 = re.search(r'<title>(?P<model>.*)</title>', r.text)
        model = m5.group('model') if m5 else ''

        if pic_url:
            try:
                self.download_image_from_url(pic_url, name='_'.join([model, title, cont]))
                print('download complete: pic from {}'.format(pic_url))
            except IOError:
                # the long descriptive name can be an invalid file name;
                # fall back to the model name alone
                print('file name IOERROR')
                self.download_image_from_url(pic_url, name=model)
                print('download complete: pic from {}'.format(pic_url))
            except Exception as e:
                print(e)

        self._xcar_lst.append(dict(pic_url=pic_url, next_url=next_url,
                                   title=title, cont=cont, model=model))

        # keep following the auto-play chain until there is no next page
        if next_url and next_url.endswith('.htm'):
            self.download_xcar(self._website + next_url)


if __name__ == '__main__':
    print("Welcome to the Pic Download for xcar.com")
    print("Downloaded files in the folder: " + BASE_FOLDER)
    print("---------------------------------------")
    id_modell = int(input("Please enter the modell id (e.g. 2674): "))
    year = int(input("Please enter the year (e.g. 2015): "))
    url = 'http://newcar.xcar.com.cn/{}/{}/detail/1.htm'.format(id_modell, year)
    xcar = xcarDown()
    xcar.download_xcar(url)
```
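One design note: `download_xcar` calls itself once per page, so a gallery longer than Python's default recursion limit (about 1000 frames) would raise `RecursionError`. A loop avoids that — a sketch, not the original code; `follow_pages` and `max_pages` are names introduced here:

```python
import re

import requests

WEBSITE = 'http://newcar.xcar.com.cn'


def follow_pages(start_url, headers=None, max_pages=500):
    """Yield (url, html) for each page, following var nextUrl iteratively."""
    url, seen = start_url, 0
    while url and seen < max_pages:  # max_pages also guards against nextUrl cycles
        r = requests.get(url, headers=headers)
        r.encoding = 'gbk'
        yield url, r.text
        m = re.search(r"var nextUrl = '(?P<n_url>.*\.htm)'", r.text)
        url = WEBSITE + m.group('n_url') if m else None
        seen += 1

# usage: for page_url, html in follow_pages('http://newcar.xcar.com.cn/2674/2015/detail/1.htm'):
#            ...extract pic_url / title / cont from html and save, as above...
```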