我用cheerio分析页面,然后用fs.appendFile将分析出的内容写入TXT,做了一个简单的爬虫,全部跑下来最后发现数据大概丢失了四分之一还多,但是单独分析某个丢失区域的时候,发现又可以抓取到,是我的代码写得有问题,还是进程哪里有问题?自己看不明白了
var fs = require("fs");
var request = require('request');
var cheerio = require('cheerio');
// Index page of the crawl; its first `.qylb` list holds the province links.
var requrl = 'http://www.ruyile.com/xxlb.aspx?id=1&t=2';
// Module-level scratch string, kept so any code that assigns to `temp` keeps working.
var temp = '';

// Start the crawl. A failed root request is reported explicitly: without the
// index page nothing else runs, and silently returning would leave the output
// file empty with no hint as to why.
request(requrl, function(error, response, body) {
  if (error || !response || response.statusCode !== 200) {
    var reason = error ? String(error) : 'HTTP ' + (response ? response.statusCode : 'no response');
    console.error('Root request failed for ' + requrl + ' - ' + reason);
    return;
  }
  acquireData(body);
});
/**
 * Parses the index page and starts one province crawl per province link.
 *
 * Reads the children of the first `.qylb` element; for every child that is a
 * link with a text label it calls provinceQuery(href, label + ',').
 *
 * @param {string} data - HTML of the index page.
 */
function acquireData(data) {
  // The original loop always ran exactly 31 times (presumably the number of
  // provinces listed first - TODO confirm). The cap is kept, but the loop is
  // now also bounded by the real child count so a shorter list cannot crash.
  var PROVINCE_LIMIT = 31;
  var $ = cheerio.load(data);
  var lists = $('.qylb').toArray();
  if (!lists[0] || !lists[0].children) {
    console.error('acquireData: no .qylb province list found on the index page');
    return;
  }
  var links = lists[0].children;
  var count = Math.min(PROVINCE_LIMIT, links.length);
  for (var i = 0; i < count; i++) {
    var link = links[i];
    var label = link.children && link.children[0];
    // Skip whitespace/text nodes and anchors without a target or label
    // instead of throwing a TypeError in the middle of the crawl.
    if (!link.attribs || !link.attribs.href || !label) {
      continue;
    }
    provinceQuery(link.attribs.href, label.data + ',');
  }
}
/**
 * Downloads a province page and hands its HTML to cityData.
 *
 * A transport error or a non-200 response is logged rather than ignored:
 * a dropped request here loses every row below that province, so the failure
 * must be visible to whoever inspects the output.
 *
 * @param {string} provinceUrl - URL of the province page.
 * @param {string} provinceData - CSV prefix (province name plus comma) passed through to cityData.
 */
function provinceQuery(provinceUrl, provinceData) {
  request(provinceUrl, function(error, response, body) {
    if (error || !response || response.statusCode !== 200) {
      var reason = error ? String(error) : 'HTTP ' + (response ? response.statusCode : 'no response');
      console.error('provinceQuery: request failed for ' + provinceUrl + ' [' + provinceData + '] - ' + reason);
      return;
    }
    cityData(body, provinceData);
  });
}
/**
 * Parses a province page and starts one city crawl per city link.
 *
 * Reads the children of the second `.qylb` element; for every child that is a
 * link with a text label it calls cityQuery(href, cData + label + ',').
 *
 * @param {string} data1 - HTML of the province page.
 * @param {string} cData - CSV prefix built so far (province name plus comma).
 */
function cityData(data1, cData) {
  var $ = cheerio.load(data1);
  var lists = $('.qylb').toArray();
  var cityList = lists[1];
  // The second list may be absent on some pages; report it instead of
  // throwing a TypeError from inside the request callback.
  if (!cityList || !cityList.children) {
    console.error('cityData: no city list found for ' + cData);
    return;
  }
  for (var i = 0; i < cityList.children.length; i++) {
    var link = cityList.children[i];
    var label = link.children && link.children[0];
    // Skip text nodes and anchors that have no target or no label.
    if (!link.attribs || !link.attribs.href || !label) {
      continue;
    }
    cityQuery(link.attribs.href, cData + label.data + ',');
  }
}
/**
 * Downloads a city page and hands its HTML to districtData.
 *
 * A transport error or a non-200 response is logged rather than ignored,
 * so that rows missing from the output can be traced back to the request
 * that failed.
 *
 * @param {string} cityHref - URL of the city page.
 * @param {string} trans - CSV prefix (province and city) passed through to districtData.
 */
function cityQuery(cityHref, trans) {
  request(cityHref, function(error, response, body) {
    if (error || !response || response.statusCode !== 200) {
      var reason = error ? String(error) : 'HTTP ' + (response ? response.statusCode : 'no response');
      console.error('cityQuery: request failed for ' + cityHref + ' [' + trans + '] - ' + reason);
      return;
    }
    districtData(body, trans);
  });
}
/**
 * Parses a city page and starts one district crawl per district link.
 *
 * Reads the children of the third `.qylb` element, if the page has one; for
 * every child link it calls districtQuery(href, dData + name + ',').
 * A page without a third list produces no calls.
 *
 * @param {string} data2 - HTML of the city page.
 * @param {string} dData - CSV prefix built so far (province and city).
 */
function districtData(data2, dData) {
  var $ = cheerio.load(data2);
  var lists = $('.qylb').toArray();
  var districtList = lists[2];
  if (!districtList || !districtList.children) {
    return;
  }
  for (var j = 0; j < districtList.children.length; j++) {
    var link = districtList.children[j];
    // Skip text nodes and anchors without a target.
    if (!link.attribs || !link.attribs.href) {
      continue;
    }
    var label = link.children && link.children[0];
    // The original `data + ',' || ' ,'` never reached its fallback because
    // `+` binds tighter than `||`; apply the blank placeholder to the name
    // itself so an empty label yields ' ,' as intended.
    var district = ((label && label.data) || ' ') + ',';
    districtQuery(link.attribs.href, dData + district);
  }
}
/**
 * Downloads a district page and hands its HTML to streetData.
 *
 * A transport error or a non-200 response is logged rather than ignored.
 * This is the level with the most requests in flight, so it is where
 * dropped responses are most likely to cause missing rows.
 *
 * @param {string} dishref - URL of the district page.
 * @param {string} tran - CSV prefix (province, city, district) passed through to streetData.
 */
function districtQuery(dishref, tran) {
  request(dishref, function(error, response, body) {
    if (error || !response || response.statusCode !== 200) {
      var reason = error ? String(error) : 'HTTP ' + (response ? response.statusCode : 'no response');
      console.error('districtQuery: request failed for ' + dishref + ' [' + tran + '] - ' + reason);
      return;
    }
    streetData(body, tran);
  });
}
/**
 * Follows `.next` from `node` the given number of times.
 * Returns the node reached, or null when the sibling chain ends earlier.
 */
function nthNextSibling(node, steps) {
  var current = node;
  for (var i = 0; i < steps && current; i++) {
    current = current.next;
  }
  return current || null;
}

/**
 * Parses a district page and writes one CSV row per school via xxre.
 *
 * Each `.xxlb .sk h4` heading yields a row
 * `sData + name,tel,postal,address\n`; any field that cannot be found keeps
 * the placeholder '暂无'. The fields are read from the heading's following
 * siblings: the 1st sibling is taken as the phone text, the 3rd as either the
 * postal code (when it contains '邮编') or the address, and the 5th as the
 * address when the 3rd was the postal code.
 *
 * @param {string} data3 - HTML of the district page.
 * @param {string} sData - CSV prefix built so far (province, city, district).
 */
function streetData(data3, sData) {
  // Strips the leading "label:" part of a field, up to the last colon.
  var labelPrefix = /^.+:/;
  var $ = cheerio.load(data3);
  var headings = $('.xxlb .sk h4').toArray();
  for (var i = 0; i < headings.length; i++) {
    var heading = headings[i];
    var schoolName = '暂无';
    var tel = '暂无';
    var postal = '暂无';
    var dis = '暂无';
    // Guard each level: a heading without a link, or a link without text,
    // must produce a placeholder row rather than a TypeError.
    var link = heading.children && heading.children[0];
    var nameNode = link && link.children && link.children[0];
    if (nameNode) {
      schoolName = nameNode.data;
      var telNode = heading.next;
      if (telNode) {
        if (telNode.data) {
          tel = telNode.data.replace(labelPrefix, '');
        }
        var third = nthNextSibling(heading, 3);
        if (third && third.data) {
          if (third.data.indexOf('邮编') > -1) {
            postal = third.data.replace(labelPrefix, '');
            // The sibling chain may end right after the postal code.
            var fifth = nthNextSibling(heading, 5);
            if (fifth && fifth.data) {
              dis = fifth.data.replace(labelPrefix, '');
            }
          } else {
            dis = third.data.replace(labelPrefix, '');
          }
        }
      }
    }
    // Build the row in a local variable; rows are independent of each other,
    // so no module-level state is needed.
    var row = sData + schoolName + ',' + tel + ',' + postal + ',' + dis + '\n';
    xxre(row);
  }
}
/**
 * Appends the given text to `wx.txt` as UTF-8.
 * The write is asynchronous; a failure is printed with console.log.
 *
 * @param {string} temp - Text to append (one CSV row including its newline).
 */
function xxre(temp) {
  var reportFailure = function(writeError) {
    if (!writeError) {
      return;
    }
    console.log(writeError);
  };
  fs.appendFile('wx.txt', temp, 'utf8', reportFailure);
}
我看不懂,顶一下