node爬虫抓取数据丢失问题?

我用 cheerio 分析页面,然后用 fs.appendFile 将分析出的内容写入 TXT 文件,做了一个简单的爬虫。全部跑完后发现数据丢失了四分之一还多;但是单独抓取某个丢失的区域时,又可以正常抓到。是我的代码写得有问题,还是进程哪里出了问题?自己看不明白了。

var fs = require("fs");
var request = require('request');
var cheerio = require('cheerio');
// Entry page of the crawl; acquireData reads the province links from it.
var requrl = 'http://www.ruyile.com/xxlb.aspx?id=1&t=2';
// Module-level string buffer, initialised empty.
var temp = '';

// Start the crawl. Failures are reported instead of being dropped silently,
// so an aborted run can be told apart from a site with no data.
request(requrl, function(error, response, body) {
    if (error) {
        console.log('Index request failed for ' + requrl, error);
        return;
    }
    if (response.statusCode !== 200) {
        console.log('Index request returned status ' + response.statusCode + ' for ' + requrl);
        return;
    }
    acquireData(body);
});

/**
 * Parses the index page and starts one province request per link found in
 * the first `.qylb` block.
 *
 * @param {string} data - HTML of the index page.
 */
function acquireData(data) {
    var $ = cheerio.load(data);
    var province = $('.qylb').toArray();
    var provinceList = province[0];
    if (!provinceList || !provinceList.children) {
        console.log('acquireData: no .qylb block found on the index page');
        return;
    }
    // The original code always read exactly 31 children; the cap is kept but
    // clamped so a shorter list no longer throws.
    // NOTE(review): 31 is presumably the number of provinces wanted - confirm.
    var limit = Math.min(31, provinceList.children.length);
    for (var i = 0; i < limit; i++) {
        var link = provinceList.children[i];
        // Skip nodes that are not links with a label (e.g. whitespace text
        // nodes), which would otherwise raise a TypeError and stop the crawl.
        if (!link.attribs || !link.attribs.href || !link.children || !link.children[0]) {
            continue;
        }
        var provinceData = link.children[0].data + ',';
        provinceQuery(link.attribs.href, provinceData);
    }
}

/**
 * Downloads a province page and hands its HTML to cityData.
 *
 * Failed requests are logged with their URL; previously they were discarded
 * silently, which made missing output rows impossible to trace.
 *
 * @param {string} provinceUrl - URL of the province page.
 * @param {string} provinceData - CSV prefix built so far (province name plus comma).
 */
function provinceQuery(provinceUrl, provinceData) {
    request(provinceUrl, function(error, response, body) {
        if (error) {
            console.log('provinceQuery: request failed for ' + provinceUrl, error);
            return;
        }
        if (response.statusCode !== 200) {
            console.log('provinceQuery: status ' + response.statusCode + ' for ' + provinceUrl);
            return;
        }
        cityData(body, provinceData);
    });
}

/**
 * Parses a province page and starts one city request per link found in the
 * second `.qylb` block.
 *
 * @param {string} data1 - HTML of the province page.
 * @param {string} cData - CSV prefix built so far; the city name is appended to it.
 */
function cityData(data1, cData) {
    var $ = cheerio.load(data1);
    var blocks = $('.qylb').toArray();
    var cityList = blocks[1];
    // A page without a second `.qylb` block used to throw a TypeError here.
    if (!cityList || !cityList.children) {
        console.log('cityData: no city list found for ' + cData);
        return;
    }
    for (var i = 0; i < cityList.children.length; i++) {
        var link = cityList.children[i];
        // Skip nodes that are not links with a label (e.g. text nodes).
        if (!link.attribs || !link.attribs.href || !link.children || !link.children[0]) {
            continue;
        }
        var transform = cData + link.children[0].data + ',';
        cityQuery(link.attribs.href, transform);
    }
}

/**
 * Downloads a city page and hands its HTML to districtData.
 *
 * Failed requests are logged with their URL instead of being dropped silently.
 *
 * @param {string} cityHref - URL of the city page.
 * @param {string} trans - CSV prefix built so far (province and city).
 */
function cityQuery(cityHref, trans) {
    request(cityHref, function(error, response, body) {
        if (error) {
            console.log('cityQuery: request failed for ' + cityHref, error);
            return;
        }
        if (response.statusCode !== 200) {
            console.log('cityQuery: status ' + response.statusCode + ' for ' + cityHref);
            return;
        }
        districtData(body, trans);
    });
}

/**
 * Parses a city page and starts one district request per link found in the
 * third `.qylb` block.
 *
 * @param {string} data2 - HTML of the city page.
 * @param {string} dData - CSV prefix built so far; the district name is appended to it.
 */
function districtData(data2, dData) {
    var $ = cheerio.load(data2);
    var blocks = $('.qylb').toArray();
    var districtList = blocks[2];
    if (!districtList || !districtList.children) {
        // NOTE(review): pages without a third `.qylb` block produce no output
        // at all; confirm whether such pages list schools directly.
        console.log('districtData: no district list found for ' + dData);
        return;
    }
    for (var j = 0; j < districtList.children.length; j++) {
        var link = districtList.children[j];
        // Skip nodes that cannot be followed (e.g. text nodes).
        if (!link.attribs || !link.attribs.href) {
            continue;
        }
        var label = link.children && link.children[0] && link.children[0].data;
        // The original `name + ',' || ' ,'` could never reach its fallback
        // because the concatenation is always truthy; apply the blank
        // placeholder to the name itself.
        var district = (label || ' ') + ',';
        var transf = dData + district;
        districtQuery(link.attribs.href, transf);
    }
}

/**
 * Downloads a district page and hands its HTML to streetData.
 *
 * Failed requests are logged with their URL instead of being dropped silently.
 *
 * @param {string} dishref - URL of the district page.
 * @param {string} tran - CSV prefix built so far (province, city, district).
 */
function districtQuery(dishref, tran) {
    request(dishref, function(error, response, body) {
        if (error) {
            console.log('districtQuery: request failed for ' + dishref, error);
            return;
        }
        if (response.statusCode !== 200) {
            console.log('districtQuery: status ' + response.statusCode + ' for ' + dishref);
            return;
        }
        streetData(body, tran);
    });
}

/**
 * Parses a district page and appends one CSV line per school heading
 * (`.xxlb .sk h4`) to the output file via xxre.
 *
 * Each line is `sData` followed by name, telephone, postal code and address,
 * with '暂无' standing in for any field that is missing.
 *
 * @param {string} data3 - HTML of the district page.
 * @param {string} sData - CSV prefix (province, city, district).
 */
function streetData(data3, sData) {
    var $ = cheerio.load(data3);
    var headings = $('.xxlb .sk h4').toArray();
    for (var i = 0; i < headings.length; i++) {
        // Build the line in a local variable; the shared module-level buffer
        // is not needed to pass the text on.
        var line = sData + schoolFields(headings[i]).join(',') + '\n';
        xxre(line);
    }
}

/** Returns the node `steps` siblings after `node`, or null if the chain ends early. */
function nthNextSibling(node, steps) {
    var current = node;
    for (var k = 0; k < steps && current; k++) {
        current = current.next;
    }
    return current || null;
}

/** Removes everything up to and including the last ':' on the first line of `text`. */
function stripLabel(text) {
    return text.replace(/^.+:/, '');
}

/** Extracts [name, tel, postal, address] for one school heading node. */
function schoolFields(heading) {
    var schoolName = '暂无';
    var tel = '暂无';
    var postal = '暂无';
    var dis = '暂无';

    var anchor = heading.children && heading.children[0];
    var nameNode = anchor && anchor.children && anchor.children[0];
    if (!nameNode) {
        return [schoolName, tel, postal, dis];
    }
    if (typeof nameNode.data === 'string') {
        schoolName = nameNode.data;
    }

    // The telephone text is the node right after the heading.
    var telNode = nthNextSibling(heading, 1);
    if (telNode && telNode.data) {
        tel = stripLabel(telNode.data);
    }

    // Three siblings on is either the postal code (then the address follows
    // two siblings later) or the address itself.
    var third = nthNextSibling(heading, 3);
    if (third && third.data) {
        if (third.data.indexOf('邮编') > -1) {
            postal = stripLabel(third.data);
            // Walk safely: the original chained `.next.next` here and threw
            // when the intermediate sibling was missing.
            var fifth = nthNextSibling(third, 2);
            if (fifth && fifth.data) {
                dis = stripLabel(fifth.data);
            }
        } else {
            dis = stripLabel(third.data);
        }
    }
    return [schoolName, tel, postal, dis];
}

/**
 * Appends one chunk of text to 'wx.txt' (UTF-8).
 *
 * The write is synchronous so that lines reach the file in call order and
 * only one write is in flight at a time; the crawler can call this many
 * times in quick succession, and overlapping asynchronous appends carry no
 * ordering guarantee. Write errors are logged, not thrown, as before.
 *
 * @param {string} temp - Text to append (normally one CSV line ending in '\n').
 */
function xxre(temp) {
    try {
        fs.appendFileSync('wx.txt', temp, 'utf8');
    } catch (err) {
        console.log(err);
    }
}
阅读 3.3k
2 个回答
撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进
推荐问题