抓取某个帖子的图片时,内存占用从一开始不到 10 MB 涨到最后的 33 MB。为什么内存占用会增加这么多?代码里已经使用了 using (){} 代码段,还能怎样优化才能减少内存占用?
代码入口是 BtnImageClick(),其中 inputTid 是用户输入的帖子 tid(一串数字)。
/*
* Created by SharpDevelop.
* User: licre
* Date: 2021/3/18
* Time: 4:58
* 下载nga论坛网页版的图片
* To change this template use Tools | Options | Coding | Edit Standard Headers.
*/
using System;
using System.Collections.Generic;
using System.Drawing;
using System.Windows.Forms;
// 用户
using System.Net;
using System.Text.RegularExpressions;
using System.IO;
using System.Threading;
namespace ngan6
{
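/// <summary>
/// Main window of the NGA forum media downloader: reads a thread id from the user,
/// scans every page of that thread and saves the linked images or audio clips
/// into a folder named after the thread id.
/// </summary>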
public partial class MainForm : Form
{
// Cookie header sent with every request.
// SECURITY NOTE(review): this is a hard-coded forum session cookie (passport uid/cid)
// committed to source. It should be loaded from user configuration instead; it is kept
// here only so that existing behavior does not change.
private static readonly string default_cookie = @"ngaPassportUid=61471889; ngaPassportCid=X947obm5v6mjjorchujde3b6ab2pvbv7vgk8itjo; ";
// User-Agent header sent with every request (a desktop Chrome signature).
private static readonly string user_agent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192";
/// <summary>
/// Creates the main window. No work is done here beyond building the controls.
/// </summary>
public MainForm()
{
    // Required for Windows Forms designer support; it must run before any
    // control of the form is touched.
    this.InitializeComponent();
}
/// <summary>
/// Downloads one page of a thread from bbs.nga.cn in its "lite=js" text form.
/// </summary>
/// <param name="tid">Thread id as typed by the user. It is trimmed and URL-escaped
/// before being placed in the query string; null is treated as empty.</param>
/// <param name="page_num">Page number to request.</param>
/// <returns>The response body as text; never null or empty.</returns>
/// <exception cref="WebException">The request failed, or the server answered with an empty body.</exception>
public string download_json(string tid, int page_num){
    // Escape the id so stray characters in the text box cannot alter the query string.
    string safe_tid = Uri.EscapeDataString((tid ?? String.Empty).Trim());
    string link = "https://bbs.nga.cn/read.php?tid=" + safe_tid + "&lite=js&__output=8&page=" + Convert.ToString(page_num);
    string json_body;
    using (WebClient dl = new WebClient()){
        dl.Headers.Add("User-agent", MainForm.user_agent);
        dl.Headers.Add("Cookie", MainForm.default_cookie);
        json_body = dl.DownloadString(link);
    }
    if (String.IsNullOrEmpty(json_body)){
        // WebException derives from Exception, so callers with a generic catch still work.
        throw new WebException("下载json失败");
    }
    return json_body;
}
/// <summary>
/// Reads the reply counter out of a thread page and converts it to a page count.
/// </summary>
/// <param name="tid">Thread id; currently unused, kept for interface compatibility.</param>
/// <param name="html_body">Text of the first page. When null or empty, a built-in
/// sample (94 replies) is used instead, which is handy for manual testing.</param>
/// <returns>Number of pages, computed as 1 + replies / 20 (integer division).</returns>
/// <exception cref="FormatException">No <c>replies":N,"authorid</c> fragment was found in the text.</exception>
/// <exception cref="OverflowException">The reply counter does not fit in a 32-bit integer.</exception>
public int summarise(string tid, string html_body=""){
    if (String.IsNullOrEmpty(html_body)){
        // Sample input for testing.
        html_body = "333replies\":94,\"authorid\"666\"authorid\":594999,\"subject\":\"大哥们护栏多少钱一米啊\",\"type\":33554436,\"fid\"";
    }
    Match reply_match = Regex.Match(html_body, "replies\":(\\d+),\"authorid");
    // Groups.Count is never 0 (group 0 always exists), so Success is the only
    // reliable way to detect "no match".
    if (!reply_match.Success){
        throw new FormatException("无法找到帖子回复数");
    }
    // 32-bit conversion: a 16-bit one overflows once a thread passes 32767 replies.
    int reply_count = Convert.ToInt32(reply_match.Groups[1].Value);
    return 1 + reply_count / 20;
}
/// <summary>
/// Downloads the text of every page of a thread.
/// </summary>
/// <param name="tid">Thread id.</param>
/// <returns>One entry per page. Entry 0 is the complete first page; later entries are
/// cut down to the span between the <c>"__R"</c> and <c>"__T"</c> markers when both are
/// present, otherwise the complete page is kept.</returns>
public string[] scan_pages(string tid){
    // The first page also carries the reply counter that tells us how many pages exist.
    string page1 = download_json(tid, 1);
    int page_count = summarise(tid, page1);
    string[] pages = new string[page_count];
    pages[0] = page1;
    for (int page_num = 2; page_num <= page_count; page_num += 1){
        // Pause before each further request so consecutive requests are always
        // one second apart and no time is wasted after the last page.
        Thread.Sleep(1000);
        pages[page_num - 1] = extract_replies(download_json(tid, page_num));
    }
    return pages;
}
// Returns the part of a page between the "__R" and the following "__T" marker,
// or the whole page when either marker is missing.
private static string extract_replies(string raw){
    int start = raw.IndexOf("\"__R\"", StringComparison.Ordinal);
    if (start < 0){
        return raw;
    }
    int end = raw.IndexOf("\"__T\"", start, StringComparison.Ordinal);
    if (end < 0){
        return raw;
    }
    return raw.Substring(start, end - start);
}
/// <summary>
/// Collects the image links found in <c>[img]...[/img]</c> tags of the given pages.
/// </summary>
/// <param name="pages">Page texts; null or empty entries are skipped. A null array yields an empty result.</param>
/// <returns>Map from file name (the part of the link after the last '/') to absolute link.
/// The first link seen for a file name wins.</returns>
public Dictionary<string, string> refine_image_links(string[] pages){
    // file name -> image link
    Dictionary<string, string> images = new Dictionary<string, string>();
    if (pages == null){
        return images;
    }
    Regex img_re = new Regex(@"\[img\](.*?)\[\/img\]");
    foreach (string page in pages){
        if (String.IsNullOrEmpty(page)){
            continue;
        }
        foreach (Match m in img_re.Matches(page)){
            string link = m.Groups[1].Value;
            if (link.Contains("[")){
                // The closing [/img] was cut off, so the capture ran into other
                // bbcode or plain text; such a link is unusable.
                continue;
            }
            string name = link.Substring(link.LastIndexOf('/') + 1);
            if (name.Length == 0){
                // Empty tag or link ending in '/': there is no file name to save under.
                continue;
            }
            if (images.ContainsKey(name)){
                continue;
            }
            if (link.StartsWith("./", StringComparison.Ordinal)){
                // Relative attachment path: prefix the attachment host.
                link = "https://img.nga.178.com/attachments" + link.Substring(1);
            }
            images.Add(name, link);
        }
    }
    return images;
}
/// <summary>
/// Collects the audio links found in <c>[flash=audio]...[/flash]</c> tags of the given pages.
/// </summary>
/// <param name="pages">Page texts; null or empty entries are skipped. A null array yields an empty result.</param>
/// <returns>Map from file name to absolute link. A trailing <c>?duration...</c> part is
/// removed from the link. File names containing <c>nga_audio</c> are renamed to
/// <c>audio_1</c>, <c>audio_2</c>, ... so that different clips with that name do not
/// collide; the same clip link is still recorded only once.</returns>
public Dictionary<string, string> refine_audio_links(string[] pages){
    // file name -> audio link
    Dictionary<string, string> audios = new Dictionary<string, string>();
    if (pages == null){
        return audios;
    }
    Regex audio_re = new Regex(@"\[flash=audio\](.*?)\[\/flash\]");
    int special = 1; // running number for renamed "nga_audio" clips
    foreach (string page in pages){
        if (String.IsNullOrEmpty(page)){
            continue;
        }
        foreach (Match m in audio_re.Matches(page)){
            string link = m.Groups[1].Value;
            if (link.Contains("[")){
                // The closing [/flash] was cut off, so the capture ran into other
                // bbcode or plain text; such a link is unusable.
                continue;
            }
            int duration_pos = link.IndexOf("?duration", StringComparison.Ordinal);
            if (duration_pos >= 0){
                link = link.Substring(0, duration_pos);
            }
            string name = link.Substring(link.LastIndexOf('/') + 1);
            if (name.Length == 0){
                // Empty tag or link ending in '/': there is no file name to save under.
                continue;
            }
            if (link.StartsWith("./", StringComparison.Ordinal)){
                // Relative attachment path: prefix the attachment host.
                link = "https://img.nga.178.com/attachments" + link.Substring(1);
            }
            if (name.Contains("nga_audio")){
                // These names repeat across clips, so duplicates are recognised by
                // link rather than by name before a unique name is handed out.
                if (audios.ContainsValue(link)){
                    continue;
                }
                // Strings are immutable: Replace returns the new text, it must be assigned.
                name = name.Replace("nga_audio", String.Format("audio_{0}", special));
                special += 1;
            }
            if (audios.ContainsKey(name)){
                continue;
            }
            audios.Add(name, link);
        }
    }
    return audios;
}
/// <summary>
/// Downloads every entry of <paramref name="media"/> into the folder <c>./{tid}</c>.
/// </summary>
/// <param name="tid">Thread id; used as the name of the target folder.</param>
/// <param name="media">Map from file name to download link.</param>
/// <returns>Number of files that were newly saved. Files that already exist and
/// downloads of 12000 bytes or less are not counted.</returns>
/// <remarks>
/// Each file is streamed to a temporary <c>.part</c> file and renamed once complete.
/// That avoids holding the whole file in a byte array in memory, and an interrupted
/// download can no longer leave a truncated file that later runs would skip as
/// "already downloaded". Runs on the calling (UI) thread and pumps messages via
/// <c>Application.DoEvents()</c> so the progress label repaints.
/// </remarks>
public int download_media(string tid, Dictionary<string, string> media){
    string path = String.Format("./{0}", tid);
    // CreateDirectory does nothing when the folder already exists.
    Directory.CreateDirectory(path);
    int count = 0;
    foreach (KeyValuePair<string, string> item in media){
        string name = item.Key;
        string link = item.Value;
        lblProgress.Text = name;
        Application.DoEvents();
        string media_path = String.Format("./{0}/{1}", tid, name);
        if (File.Exists(media_path)){
            continue;
        }
        string temp_path = media_path + ".part";
        try {
            // A fresh client per file, with the request headers set each time.
            using (WebClient dl = new WebClient()){
                dl.Headers.Add("User-agent", MainForm.user_agent);
                dl.Headers.Add("Cookie", MainForm.default_cookie);
                dl.DownloadFile(link, temp_path);
            }
            if (new FileInfo(temp_path).Length <= 12000){
                // Skip very small files, they are probably emoticons.
                continue;
            }
            File.Move(temp_path, media_path);
        } finally {
            // Removes the temporary file after a skip or a failed download;
            // after a successful move there is nothing left to delete.
            if (File.Exists(temp_path)){
                File.Delete(temp_path);
            }
        }
        count += 1;
        // Be polite to the server between two saved files.
        Thread.Sleep(1000);
    }
    return count;
}
/// <summary>
/// Click handler of the image button: downloads all images of the thread whose id
/// is in the input box and reports the number of saved files.
/// </summary>
void BtnImageClick(object sender, EventArgs e)
{
    string tid = inputTid.Text.Trim();
    // The id ends up in a URL and in a folder name, so accept digits only.
    if (!Regex.IsMatch(tid, @"\A[0-9]+\z")){
        lblProgress.Text = "请输入帖子tid(一串数字)";
        return;
    }
    // The download pumps window messages while it runs; disable the button so a
    // second click cannot start a nested download.
    Control trigger = sender as Control;
    if (trigger != null){
        trigger.Enabled = false;
    }
    try {
        string[] pages = scan_pages(tid);
        Dictionary<string, string> images = refine_image_links(pages);
        int count = download_media(tid, images);
        lblProgress.Text = String.Format("共下载{0}张图片", count);
    } finally {
        if (trigger != null){
            trigger.Enabled = true;
        }
    }
}
/// <summary>
/// FormClosing handler: ends the whole process with exit code 0 as soon as the
/// window starts closing.
/// </summary>
void MainFormFormClosing(object sender, FormClosingEventArgs e)
{
    const int exit_code = 0;
    Environment.Exit(exit_code);
}
/// <summary>
/// Click handler of the audio button: downloads all audio clips of the thread whose
/// id is in the input box and reports the number of saved files.
/// </summary>
void BtnAudioClick(object sender, EventArgs e)
{
    string tid = inputTid.Text.Trim();
    // The id ends up in a URL and in a folder name, so accept digits only.
    if (!Regex.IsMatch(tid, @"\A[0-9]+\z")){
        lblProgress.Text = "请输入帖子tid(一串数字)";
        return;
    }
    // The download pumps window messages while it runs; disable the button so a
    // second click cannot start a nested download.
    Control trigger = sender as Control;
    if (trigger != null){
        trigger.Enabled = false;
    }
    try {
        string[] pages = scan_pages(tid);
        Dictionary<string, string> audios = refine_audio_links(pages);
        int count = download_media(tid, audios);
        lblProgress.Text = String.Format("共下载{0}个音频", count);
    } finally {
        if (trigger != null){
            trigger.Enabled = true;
        }
    }
}
}
}