How to get all link addresses in Baidu search results in C#

test · Posted on 4/11/2015 9:06:01 PM

How to get the address of all links in Baidu search results in C#?
That is, through keywords from Baidu search results, and then export the URL in batches, how to do it?

I didn't find the relevant source code from the Internet、、、

Little scum · Posted on 4/11/2015 9:35:57 PM

using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Net;
using System.IO;
namespace baiduRobotStrim
{
struct BaiduEntry
{
public string title, brief, link;
}
class Program
{
static string GetHtml(string keyword)
{
string url = @"http://www.baidu.com/";
string encodedKeyword = HttpUtility.UrlEncode(keyword, Encoding.GetEncoding(936));
//百度使用codepage 936字符编码来作为查询串，果然专注于中文搜索……
//更不用说，还很喜欢微软
//谷歌能正确识别UTF-8编码和codepage这两种情况，不过本身网页在HTTP头里标明是UTF-8的
//估计谷歌也不讨厌微软（以及微软的专有规范）
string query = "s?wd=" + encodedKeyword;
HttpWebRequest req;
HttpWebResponse response;
Stream stream;
req = (HttpWebRequest)WebRequest.Create(url + query);
response = (HttpWebResponse)req.GetResponse();
stream = response.GetResponseStream();
int count = 0;
byte[] buf = new byte[8192];
string decodedString = null;
StringBuilder sb = new StringBuilder();
try
{
Console.WriteLine("正在读取网页{0}的内容……", url + query);
do
{
count = stream.Read(buf, 0, buf.Length);
if (count > 0)
{
decodedString = Encoding.GetEncoding(936).GetString(buf, 0, count);
sb.Append(decodedString);
}
} while (count > 0);
}
catch
{
Console.WriteLine("网络连接失败，请检查网络设置。");
}
return sb.ToString();
}
static void PrintResult(List<BaiduEntry> entries)
{
int count = 0;
entries.ForEach(delegate(BaiduEntry entry)
{
Console.WriteLine("找到了百度的第{0}条搜索结果：", count += 1);
if (entry.link != null)
{
Console.WriteLine("找到了一条链接：");
Console.WriteLine(entry.link);
}
if (entry.title != null)
{
Console.WriteLine("标题为：");
Console.WriteLine(entry.title);
}
if (entry.brief != null)
{
Console.WriteLine("下面是摘要：");
Console.WriteLine(entry.brief);
}
Program.Cut();
});
}
static void simpleOutput()
{
string html = "<table><tr><td><font>test</font><a>hello</a><br></td></tr></table>";
Console.WriteLine(RemoveSomeTags(html));
}
static string RemoveVoidTag(string html)
{
string[] filter = { "<br>" };
foreach (string tag in filter)
{
html = html.Replace(tag, "");
}
return html;
}
static string ReleaseXmlTags(string html)
{
string[] filter = { "<a.*?>", "</a>", "<em>", "</em>", "<b>", "</b>", "<font.*?>", "</font>" };
foreach (string tag in filter)
{
html = Regex.Replace(html, tag, "");
}
return html;
}
static string RemoveSomeTags(string html)
{
html = RemoveVoidTag(html);
html = ReleaseXmlTags(html);
return html;
}
static void Cut()
{
Console.WriteLine("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
}
static void MainProc(string input)
{
MainProc(input, false);
}
static void MainProc(string input, bool tagsForBrief)
{
Regex r = new Regex("<table.*?</table>", RegexOptions.IgnoreCase);
//提取出(<table>,</table>)对，并等待进一步处理。
Match m = r.Match(input);
List<string> collection = new List<string>();
while (m.Success)
{
collection.Add(m.Value);
//找出tagname为table的节点并存储到collection变量中
m = m.NextMatch();
}
List<BaiduEntry> entries = new List<BaiduEntry>();
collection.ForEach(delegate(string entry)
{
r = new Regex("<td.*?>(.*)</td>", RegexOptions.IgnoreCase);
if(r.IsMatch(entry))
{//从entry字符串里捕获到的就是百度里存储在每个table标签里的td标签了。
//现阶段中，百度页面里有几个table标签是兄弟节点的关系，
//第一个table标签是一个广告，剩下的table标签刚好都是搜索结果。
//理想状态下input字符串里只有几个由table标签组织的搜索结果项。
//理应使用预处理过的字符串来调用本函数
m = r.Match(entry);
string html = m.Groups[1].Value;//直接使用捕获分组1的值。
//html变量里存储着td节点的innerHTML，那里有真正的搜索结果
BaiduEntry baidu = new BaiduEntry();
r = new Regex("<a.*?href="(.*?)".*?>", RegexOptions.IgnoreCase);
if (r.IsMatch(html))
{
string linkString = r.Match(html).Groups[1].Captures[0].Value;
baidu.link = linkString;
}
r = new Regex("<font.*</font>");
//td节点下有一些嵌套了2层的font标签，把这个大的font标签拿下来。
html = r.Match(html).Value;//现在html变量里存储着比较浓缩的信息了。
r = new Regex("<font.*?>(.*?)</font>");
Match contentMatch = r.Match(html);
if (contentMatch.Success)
{
string title = contentMatch.Groups[1].Captures[0].Value;
title = RemoveSomeTags(title);
baidu.title = title;
contentMatch = contentMatch.NextMatch();
if (contentMatch.Success)
{
string brief = contentMatch.Groups[1].Captures[0].Value;
int splitIndex = brief.IndexOf("<font");
if (splitIndex > -1)
brief = brief.Substring(0, splitIndex);
if (!tagsForBrief)
brief = RemoveSomeTags(brief);
//如果不需要带有HTML格式的摘要，那么就处理掉HTML标签
baidu.brief = brief;
}
}
else
{
if (html == "") return;
Console.WriteLine("怪了，这里没有找到任何结果。");
Console.WriteLine("如果百度已经更改了页面的结构那么程序需要重新设计。");
Console.WriteLine("Mark:");
Console.WriteLine(html);
Cut();
Cut();
Cut();
}
entries.Add(baidu);
}
});
PrintResult(entries);
}
public static void Main(string[] args)
{
Console.WriteLine("请输入一个关键字。");
string keyword;
keyword = Console.ReadLine();
Console.WriteLine("正在从百度上获取结果，请稍等……");
string input;
input = GetHtml(keyword);
Regex r = new Regex("<table.*class="result"[\\s\\S]*</table>", RegexOptions.IgnoreCase);
input = r.Match(input).Value;
MainProc(input);
Console.ReadKey(true);
}
}
}

Copy code

Little scum · Posted on 4/11/2015 9:36:19 PM

using System;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections.Generic;
using System.Net;
using System.IO;
namespace ConsoleDemo
{
public class Program
{
static void Main(string[] args)
{
BaiduSearch baidu = new BaiduSearch();
Console.WriteLine("请输入单词：");
string word = Console.ReadLine();
//"腾讯抄袭";
//string word = "software";
string html = baidu.Search(word);
if (!string.IsNullOrEmpty(html))
{
int count = baidu.GetSearchCount(html);
Console.WriteLine(string.Format("word:{0} count:{1}", word, count));
if (count > 0)
{
List<Keyword> keywords = baidu.GetKeywords(html, word);
foreach (Keyword item in keywords)
{
Console.WriteLine(item.Title + " " + item.Link);
}
}
}
Console.ReadLine();
}
private void GetBaiduZhiDao(string urlsearch = @"http://www.baidu.com/baidu?word=%E5%B0%8F%E6%B8%B8%E6%88%8F&tn=360se_8_dg&ie=utf-8")
{
//匿名方法+Lambda，读取网页文本
Func<string, string> GetHtmlCode = url =>
{
WebClient client = new WebClient();
Stream stream = client.OpenRead(url);
StreamReader readerOfStream = new StreamReader(stream, Encoding.GetEncoding("GB2312"));
string html = readerOfStream.ReadToEnd();
stream.Close();
return html;
};
string html_code = GetHtmlCode(urlsearch);
MatchCollection mc = Regex.Matches(html_code, @"(?is)<a[^>]*?href=(""?)(?<url>http[^"" ]+?)\1[^>]*target=""_blank""[^>]*><font size=""3""[^>]*>(?<text>.+?)</a><br[^>]*><font size=(""?)-1\2>");
StringBuilder builder = new StringBuilder();
foreach (Match m in mc)
{
builder.AppendLine("text = " + Regex.Replace(m.Groups["text"].Value, "<[^>]*>", string.Empty));
builder.AppendLine("url = " + m.Groups["url"].Value);
}
Console.WriteLine(builder.ToString());
}
}
class BaiduSearch
{
protected string uri = "http://www.baidu.com/s?wd=";
//protected string uri = "http://www.baidu.com/s?wd=software&pn=10&usm=2"; // 第二页
protected Encoding queryEncoding = Encoding.GetEncoding("gb2312");
protected Encoding pageEncoding = Encoding.GetEncoding("gb2312");
protected string resultPattern = @"(?<=找到相关结果[约]?)[0-9,]*?(?=个)";
public string Search(string word)
{
string html = string.Empty;
//string uriString = uri + System.Web.HttpUtility.UrlEncode(word, queryEncoding);
string uriString = uri;
try
{
html = WebFunc.GetHtml(uriString, pageEncoding);
}
catch (Exception ex)
{
Console.WriteLine(ex.Message);
}
return html;
}
public int GetSearchCount(string html)
{
int result = 0;
string searchcount = string.Empty;
Regex regex = new Regex(resultPattern);
Match match = regex.Match(html);
if (match.Success)
{
searchcount = match.Value;
}
else
{
searchcount = "0";
}
if (searchcount.IndexOf(",") > 0)
{
searchcount = searchcount.Replace(",", string.Empty);
}
int.TryParse(searchcount, out result);
return result;
}
public List<Keyword> GetKeywords(string html, string word)
{
List<Keyword> keywords = new List<Keyword>();
Regex regTable = new Regex(@"(?is)<table[^>]*?id=(['""]?)(\d{1}|10)\1[^>]*>(?><table[^>]*>(?<o>)|</table>(?<-o>)|(?:(?!</?table\b).)*)*(?(o)(?!))</table>", RegexOptions.IgnoreCase);
//Regex regTable = new Regex(@"(?is)<table[^>]*?id=(['""]?)(\d{2})\1[^>]*>(?><table[^>]*>(?<o>)|</table>(?<-o>)|(?:(?!</?table\b).)*)*(?(o)(?!))</table>", RegexOptions.IgnoreCase);
Regex regA = new Regex(@"(?is)<a\b[^>]*?href=(['""]?)(?<link>[^'""\s>]+)\1[^>]*>(?<title>.*?)</a>", RegexOptions.IgnoreCase);
MatchCollection mcTable = regTable.Matches(html);
foreach (Match mTable in mcTable)
{
if (mTable.Success)
{
Match mA = regA.Match(mTable.Value);
if (mA.Success)
{
Keyword keyword = new Keyword();
keyword.Link = mA.Groups["link"].Value;
keyword.Title = mA.Groups["title"].Value;
keywords.Add(keyword);
}
}
}
return keywords;
}
}
class Keyword
{
public string Title { get; set; }
public string Link { get; set; }
//private string title;
//public string Title { get { return title; } set { title = value; } }
//private string link;
//public string Link { get { return link; } set { link = value; } }
}
static class WebFunc
{
public static string GetHtml(string url)
{
return GetHtml(url, Encoding.UTF8);
}
public static string GetHtml(string url, Encoding encoding)
{
WebRequest request;
request = WebRequest.Create(url);
request.Credentials = CredentialCache.DefaultCredentials;
WebResponse response;
response = request.GetResponse();
return new StreamReader(response.GetResponseStream(), encoding).ReadToEnd();
}
}
}

Copy code

[WinForm] How to get all link addresses in Baidu search results in C#

Related Posts

Sections viewed