I recently studied a fairly good e-commerce project. The project author provided complete sample data, including product information and pictures, but the pictures are referenced by fixed URLs. The product details are stored as HTML, and the img tags inside that HTML contain image URLs as well. In my experience, this kind of third-party online CDN tends to go offline sooner or later, so I decided to extract the product picture URLs from the product data and host the pictures on my own Tencent Cloud server to make sure they stay accessible.
demo data
[{
"ID": "b93e59e214fc4478ac72652a2c87fe54",
"GOODS_SERIAL_NUMBER": "2300000059885",
"SHOP_ID": "402880e860166f3c0160167897d60002",
"SUB_ID": "402880e86016d1b5016016dcd7c50004",
"GOOD_TYPE": 1,
"STATE": 0,
"IS_DELETE": 1,
"NAME": "云南红提800g/盒",
"ORI_PRICE": 18,
"PRESENT_PRICE": 15,
"AMOUNT": 10000,
"DETAIL": "<img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112029_9395.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112029_3391.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112029_7603.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112029_4718.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112030_778.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112030_2602.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112030_7913.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112030_202.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112030_4296.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112030_6956.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112030_8200.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112031_3967.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112031_5114.jpg\" width=\"100%\" height=\"auto\" alt=\"\" />",
"BRIEF": null,
"SALES_COUNT": 0,
"IMAGE1": "http://images.koow.cc/shopGoodsImg/20171225/20171225112020_561.jpg",
"IMAGE2": null,
"IMAGE3": null,
"IMAGE4": null,
"IMAGE5": null,
"ORIGIN_PLACE": null,
"GOOD_SCENT": null,
"CREATE_TIME": 1514172047397,
"UPDATE_TIME": 1522037064430,
"IS_RECOMMEND": 0,
"PICTURE_COMPERSS_PATH": "http://images.koow.cc/compressedPic/20171225112020_561.jpg"
},
{
"ID": "e0ab2f6e2802443ba117b1146cf85fee",
"GOODS_SERIAL_NUMBER": "4894375014863",
"SHOP_ID": "402880e860166f3c0160167897d60002",
"SUB_ID": "2c9f6c94609a62be0160a02d1dc20021",
"GOOD_TYPE": 1,
"STATE": 0,
"IS_DELETE": 1,
"NAME": "菓子町园道乳酸菌味夹心饼干(抹茶味)540/罐",
"ORI_PRICE": 29.8,
"PRESENT_PRICE": 29.8,
"AMOUNT": 10000,
"DETAIL": "<img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110655_230.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110656_329.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110656_2659.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110656_9521.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110656_8611.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110656_1390.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110656_7291.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110657_3919.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110657_2170.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110657_4402.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110657_1926.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110657_9438.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110657_4361.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110657_2730.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110658_314.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img 
src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110658_8779.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110658_9878.jpg\" width=\"100%\" height=\"auto\" alt=\"\" /><img src=\"http://images.koow.cc/shopGoodsDetailImg/20180213/20180213110658_3471.jpg\" width=\"100%\" height=\"auto\" alt=\"\" />",
"BRIEF": null,
"SALES_COUNT": 0,
"IMAGE1": "http://images.koow.cc/shopGoodsImg/20180213/20180213110648_2744.jpg",
"IMAGE2": null,
"IMAGE3": null,
"IMAGE4": null,
"IMAGE5": null,
"ORIGIN_PLACE": null,
"GOOD_SCENT": null,
"CREATE_TIME": 1518491222336,
"UPDATE_TIME": 1523174099461,
"IS_RECOMMEND": 0,
"PICTURE_COMPERSS_PATH": "http://images.koow.cc/compressedPic/20180213110648_2744.jpg"
}]
As you can see, the data is relatively complete, including ID, serial number, name, price, introduction and other information.
If you want to extract the image URLs from the JSON objects, the IMAGE1-IMAGE5 fields are easy to handle: just iterate over them. The image URLs in DETAIL are mixed into HTML, so they cannot be read directly, but they can be extracted with a regular expression. Follow the steps below:
Extract image URLs in IMAGE1-IMAGE5
const fs = require("fs");
/**
 * Collects the image URLs stored in the IMAGE1..IMAGE<slotCount> fields of
 * every goods record, preserving the order in which they appear in the data.
 *
 * @param {Array<Object>} goods - Parsed goods records.
 * @param {number} [slotCount=5] - How many IMAGE<n> fields to inspect per record.
 * @returns {string[]} The non-empty image URLs, in record order then slot order.
 */
function collectImageFieldUrls(goods, slotCount = 5) {
  const urls = [];
  for (const item of goods) {
    // Tolerate null / non-object entries instead of throwing on property access.
    if (item === null || typeof item !== "object") continue;
    for (let slot = 1; slot <= slotCount; slot++) {
      const url = item[`IMAGE${slot}`];
      // Skip null, missing (undefined) and empty fields so that the literal
      // text "null"/"undefined" never ends up in the URL list.
      if (typeof url === "string" && url !== "") {
        urls.push(url);
      }
    }
  }
  return urls;
}

// Read the sample data, then append every IMAGE<n> URL to result.txt.
fs.readFile("./goods_demo.json", "utf8", (err, data) => {
  if (err) {
    console.error("Failed to read ./goods_demo.json:", err.message);
    return;
  }

  // Deserialize the JSON text into an array of goods records.
  let goods;
  try {
    goods = JSON.parse(data);
  } catch (parseErr) {
    console.error("./goods_demo.json is not valid JSON:", parseErr.message);
    return;
  }
  if (!Array.isArray(goods)) {
    console.error("./goods_demo.json must contain an array of goods records");
    return;
  }

  const urls = collectImageFieldUrls(goods);
  if (urls.length === 0) return;

  // Write everything with a single append: several concurrent fs.appendFile
  // calls on the same file are not guaranteed to complete in call order.
  const payload = urls.map((url) => `\r\n${url}`).join("");
  fs.appendFile("./result.txt", payload, (writeErr) => {
    if (writeErr) console.log("写文件操作失败");
    else console.log("写文件操作成功");
  });
});
After running the above code with NodeJS, the URL in the IMAGE object can be correctly read and written to the result.txt file.
Extract the image URL in the DETAIL object
Analyzing the URL shows that it consists of the http scheme (part1), the CDN host name (part2), the directories in which the image is located (part3), and the image file name (part4):
"http://(part1)images.koow.cc(part2)/shopGoodsImg(part3)/20171225(part3)/20171225112020_561.jpg(part4)"
Based on this structure, the following regular expression can be used to match the image URLs:
// \w matches any letter, digit or underscore
// the / characters in the URL must be escaped inside a regex literal
// the . characters in the host name must be escaped as well, otherwise . matches ANY character
// {2,5} means the path consists of 2 to 5 "/segment" parts
// the g flag enables global matching (return every URL, not only the first one)
const urlReg = /http:\/\/images\.koow\.cc(\/\w+){2,5}\.jpg/g;
After adding the code for processing the DETAIL object in JSON, the overall code is as follows:
const fs = require("fs");
// Matches image URLs hosted on images.koow.cc: 2 to 5 "/segment" path parts
// followed by ".jpg". The dots of the host name are escaped so that they only
// match a literal "."; the g flag makes String.prototype.match return all hits.
const DETAIL_IMAGE_URL_REG = /http:\/\/images\.koow\.cc(\/\w+){2,5}\.jpg/g;

/**
 * Extracts every CDN image URL embedded in a DETAIL html fragment.
 *
 * @param {*} detailHtml - The DETAIL field of a goods record (html string or null).
 * @returns {string[]} Matched URLs in document order; empty when there are none.
 */
function matchDetailImageUrls(detailHtml) {
  if (typeof detailHtml !== "string") return [];
  // match() returns null when nothing matches; normalize that to an empty list.
  return detailHtml.match(DETAIL_IMAGE_URL_REG) || [];
}

/**
 * Collects, for every goods record, the image URLs found in DETAIL followed by
 * the non-empty IMAGE1..IMAGE<slotCount> fields.
 *
 * @param {Array<Object>} goods - Parsed goods records.
 * @param {number} [slotCount=5] - How many IMAGE<n> fields to inspect per record.
 * @returns {string[]} All image URLs in a deterministic order.
 */
function collectGoodsImageUrls(goods, slotCount = 5) {
  const urls = [];
  for (const item of goods) {
    // Tolerate null / non-object entries instead of throwing on property access.
    if (item === null || typeof item !== "object") continue;

    urls.push(...matchDetailImageUrls(item.DETAIL));

    for (let slot = 1; slot <= slotCount; slot++) {
      const url = item[`IMAGE${slot}`];
      // Skip null, missing (undefined) and empty fields so that the literal
      // text "null"/"undefined" never ends up in the URL list.
      if (typeof url === "string" && url !== "") {
        urls.push(url);
      }
    }
  }
  return urls;
}

// Read the sample data, then append every extracted URL to result.txt.
fs.readFile("./goods_demo.json", "utf8", (err, data) => {
  if (err) {
    console.error("Failed to read ./goods_demo.json:", err.message);
    return;
  }

  let goods;
  try {
    goods = JSON.parse(data);
  } catch (parseErr) {
    console.error("./goods_demo.json is not valid JSON:", parseErr.message);
    return;
  }
  if (!Array.isArray(goods)) {
    console.error("./goods_demo.json must contain an array of goods records");
    return;
  }

  const urls = collectGoodsImageUrls(goods);
  if (urls.length === 0) return;

  // Write everything with a single append: several concurrent fs.appendFile
  // calls on the same file are not guaranteed to complete in call order.
  const payload = urls.map((url) => `\r\n${url}`).join("");
  fs.appendFile("./result.txt", payload, (writeErr) => {
    if (writeErr) console.log("写文件操作失败");
    else console.log("写文件操作成功");
  });
});
The extracted URLs are stored in result.txt, where they await subsequent processing.
http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112029_9395.jpg
http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112029_3391.jpg
http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112029_4718.jpg
http://images.koow.cc/shopGoodsDetailImg/20171225/20171225112029_7603.jpg
……
Batch download
If you want to set up your own private CDN server, the storage paths of the files must not change; otherwise they will no longer match the paths stored in the database. How can the directory structure of the pictures be preserved when downloading in batches? It's easy, just use the wget command:
# Fetch every URL listed in result.txt, skipping files that already exist locally.
wget --no-clobber --recursive --input-file=./result.txt
-nc, --no-clobber don't overwrite existing files
-r, --recursive download recursively, download all files
-i, --input-file download the URL in the specified file
Summary
Processing JSON or XML data is an essential skill for programmers. Mastering efficient data processing methods makes work more productive and avoids unnecessary time overhead. I wrote this article to help readers who have the same needs, and I hope that you will share your own data processing tips as well!
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。