I'll post the code first:
```python
# coding=utf-8
from bs4 import BeautifulSoup
import requests
from time import sleep

# Set the target city
City_Name = 'qd'
page = 'http://newhouse.{0}.fang.com/house/s'.format(City_Name)

# Download_Newitem_List() extracts the listing-page links for the given
# city and saves them to a file
def Download_Newitem_List(url, try_num=2):
    global City_Name
    print('Downloading:', url)
    try:
        all_html = requests.get(url, timeout=10)
    except Exception as e:
        print('Download error:', e.reason)
        all_html = None
        if try_num > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return Download_Newitem_List(url, try_num - 1)
    all_html.encoding = "gb18030"
    soup = BeautifulSoup(all_html.text, "html5lib")
    # Extract the total number of new property projects
    Item_Total = soup.find('a', id="allUrl").find('span').text.replace('(', '').replace(')', '')
    # 20 projects per page: if there is a remainder, add one extra page
    if (int(Item_Total) % 20) > 0:
        Page_Num = (int(Item_Total) // 20) + 1
    else:
        Page_Num = (int(Item_Total) // 20)
    with open('{0}_list_link.txt'.format(City_Name), 'w', encoding='utf-8') as f:
        for i in range(1, Page_Num + 1):
            New_Page_Link = 'http://newhouse.{0}.fang.com/house/s/b9{1}'.format(City_Name, i)
            print(New_Page_Link)
            print(New_Page_Link, file=f)

# Download_item_link(City) extracts the link of every project on each
# listing page and saves them to a file
def Download_item_link(City):
    with open('{0}_list_link.txt'.format(City), 'r', encoding='utf-8') as f:
        #print(f.readlines())
        for line in f.readlines():
            print('Reading:', line)
            sleep(2)
            try:
                all_html = requests.get(line, timeout=10)
                all_html.encoding = "gb18030"
                #print(all_html.text)
            except Exception as e:
                print('Download error:', e)
                #if try_num > 0:
                #    if hasattr(e, 'code') and 500 <= e.code < 600:
                #        return Download_Newitem_List(url, try_num - 1)
            soup = BeautifulSoup(all_html.text, "html5lib")
            master_html = soup.find_all('div', class_='nlcd_name')
            with open('{0}_Newall_link.txt'.format(City), 'w', encoding='utf-8') as d:
                for link in master_html:
                    #print(link.get_text().rstrip() + ':' + link.a['href'].rstrip())
                    print(link.a['href'].rstrip(), file=d)

Download_Newitem_List(page)
Download_item_link('qd')
```
The code above runs as-is if you paste it into an IDE.
Taking Qingdao (qd) as the example, I extracted a total of 482 property projects, and the links for all 25 listing pages were extracted correctly. But when I use Download_item_link() to pull each project's link from those pages, something goes wrong: qd_Newall_link.txt should end up with 482 links, yet no matter what I try it only contains 20. I've been puzzling over this for a while and can't figure out where the problem is.
Hoping someone can take a look.
Two fixes. First, in Download_item_link, change the open mode for the output file from 'w' to 'a': the file is reopened inside the for-line loop, and mode 'w' truncates it on every iteration, so only the 20 links from the last listing page survive. Second, change requests.get(line) to requests.get(line.strip()), since each line read from the file still carries its trailing newline.
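For reference, here is a sketch of Download_item_link with both fixes applied. It reuses the names and file layout from the question; I dropped the commented-out retry block for brevity and haven't re-run it against the live site, so treat it as illustrative:

```python
def Download_item_link(City):
    with open('{0}_list_link.txt'.format(City), 'r', encoding='utf-8') as f:
        for line in f:
            url = line.strip()  # drop the trailing newline before requesting
            print('Reading:', url)
            sleep(2)
            all_html = requests.get(url, timeout=10)
            all_html.encoding = "gb18030"
            soup = BeautifulSoup(all_html.text, "html5lib")
            # mode 'a' appends this page's ~20 links instead of overwriting
            with open('{0}_Newall_link.txt'.format(City), 'a', encoding='utf-8') as d:
                for link in soup.find_all('div', class_='nlcd_name'):
                    print(link.a['href'].rstrip(), file=d)
```

An arguably cleaner variant is to open {0}_Newall_link.txt once with 'w' before the loop and keep writing into that single handle; with mode 'a', a file left over from a previous run would otherwise keep accumulating links across runs.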