Csharp/C Sharp/Network/Web Crawler
Contents
Download a page with WebClient.DownloadString
using System;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
class MainClass {
    // Downloads the page at remoteUri, scans it for absolute .gif URIs,
    // and downloads each image into the current directory.
    private static void Main() {
        string remoteUri = "http://www.apress.ru";
        // WebClient is IDisposable: dispose it so the underlying
        // connection resources are released deterministically.
        using (WebClient client = new WebClient()) {
            string str = client.DownloadString(remoteUri);
            // Absolute http(s) URIs ending in ".gif".
            MatchCollection matches = Regex.Matches(str, @"http\S+[^-,;:?]\.gif");
            foreach (Match match in matches) {
                // The pattern defines no capture groups, so match.Groups
                // holds only group 0 (the whole match) — use it directly
                // instead of looping over the group collection.
                string uri = match.Value;
                // File name = everything after the last '/'.
                string file = uri.Substring(uri.LastIndexOf("/") + 1);
                try {
                    Console.WriteLine("Downloading {0} to file {1}", uri, file);
                    client.DownloadFile(new Uri(uri), file);
                } catch (Exception ex) {
                    // Report why the download failed, then keep going with
                    // the remaining links (a bare catch hid the reason).
                    Console.WriteLine("Failed to download {0}: {1}", uri, ex.Message);
                }
            }
        }
    }
}
Check the ContentType
using System;
using System.IO;
using System.Net;
class HtmlDump
{
    // Fetches a fixed URI and dumps its body to stdout when the response
    // has a text/* content type. Returns 0 on success, 1 on any error.
    public static int Main(string[] astrArgs)
    {
        WebRequest webreq;
        WebResponse webres;
        try
        {
            webreq = WebRequest.Create("http://www.nfex.ru/");
            webres = webreq.GetResponse();
        }
        catch (Exception exc)
        {
            Console.WriteLine("HtmlDump: {0}", exc.Message);
            return 1;
        }
        // StartsWith avoids the ArgumentOutOfRangeException that
        // Substring(0, 4) throws when ContentType is shorter than 4 chars,
        // and OrdinalIgnoreCase tolerates "Text/html" etc.
        if (!webres.ContentType.StartsWith("text", StringComparison.OrdinalIgnoreCase))
        {
            Console.WriteLine("HtmlDump: URI must be a text type.");
            // Release the connection before bailing out.
            webres.Close();
            return 1;
        }
        // using blocks guarantee the stream and reader are closed even if
        // ReadLine throws part-way through the body.
        using (Stream stream = webres.GetResponseStream())
        using (StreamReader strrdr = new StreamReader(stream))
        {
            string strLine;
            while ((strLine = strrdr.ReadLine()) != null)
            {
                Console.WriteLine(strLine);
            }
        }
        return 0;
    }
}
Get a WebResponse from a WebRequest
using System;
using System.Net;
using System.IO;
using System.Drawing;
using System.Windows.Forms;
public class MainClass {
    // Demonstrates WebRequest/WebResponse: fetches an image and an HTML
    // page, constructs an Image from the first and prints the second.
    public static void Main() {
        string picUri = "http://www.apress.ru/img/img05/Hex_RGB4.jpg";
        string htmlUri = "http://www.apress.ru";
        WebRequest requestPic = WebRequest.Create(picUri);
        WebRequest requestHtml = WebRequest.Create(htmlUri);
        // Each WebResponse holds a network connection; dispose both so
        // the connections are released even if an exception is thrown.
        using (WebResponse responsePic = requestPic.GetResponse())
        using (WebResponse responseHtml = requestHtml.GetResponse()) {
            // Image.FromStream keeps the stream in use for the Image's
            // lifetime, so dispose the Image (and its GDI+ handle) too.
            using (Image img = Image.FromStream(responsePic.GetResponseStream())) {
                // The sample only demonstrates constructing the Image;
                // process img here as needed.
            }
            using (StreamReader r = new StreamReader(responseHtml.GetResponseStream())) {
                Console.WriteLine(r.ReadToEnd());
            }
        }
    }
}
Download a web page in a thread
using System;
using System.Net;
using System.Threading;
class ThreadTest {
    // Kicks the download off on a background thread so the main thread
    // can print immediately and wait for Enter.
    static void Main() {
        new Thread(Download).Start();
        // The original literal ("download"s happening!") contained an
        // unescaped double quote and did not compile; an apostrophe is
        // the intended character.
        Console.WriteLine("download's happening!");
        Console.ReadLine();
    }
    // Downloads google.ru's front page to index.html, reporting any
    // failure instead of silently swallowing it.
    static void Download() {
        using (WebClient wc = new WebClient())
        try {
            wc.Proxy = null;   // skip proxy auto-detection (faster first request)
            wc.DownloadFile("http://www.google.ru", "index.html");
            Console.WriteLine("Finished!");
        } catch (Exception ex) {
            // An empty catch block hid the reason the download failed.
            Console.WriteLine("Download failed: " + ex.Message);
        }
    }
}
MiniCrawler: A skeletal Web crawler
/*
C#: The Complete Reference
by Herbert Schildt
Publisher: Osborne/McGraw-Hill (March 8, 2002)
ISBN: 0072134852
*/
// MiniCrawler: A skeletal Web crawler.
using System;
using System.Net;
using System.IO;
public class MiniCrawler {
    // Scans htmlstr (starting at startloc) for the next href="http..."
    // attribute and returns the quoted URI, or null when none remains.
    // startloc is advanced past the link so repeated calls walk the page.
    static string FindLink(string htmlstr,
        ref int startloc) {
        int i;
        int start, end;
        string uri = null;
        // Search case-insensitively without disturbing the original casing
        // of the URI we extract.
        string lowcasestr = htmlstr.ToLower();
        i = lowcasestr.IndexOf("href=\"http", startloc);
        if(i != -1) {
            // The original source had bare "" quote literals here, which
            // do not compile; the delimiter is the escaped double quote.
            start = htmlstr.IndexOf("\"", i) + 1;
            end = htmlstr.IndexOf("\"", start);
            uri = htmlstr.Substring(start, end-start);
            startloc = end;
        }
        return uri;
    }
    // Interactively crawls from the URI given on the command line:
    // for each page, lists links one at a time and lets the user follow
    // (L), see more (M), or quit (Q).
    public static void Main(string[] args) {
        string link = null;
        string str;
        string answer;
        int curloc; // holds current location in response
        if(args.Length != 1) {
            Console.WriteLine("Usage: MiniCrawler <uri>");
            return ;
        }
        string uristr = args[0]; // holds current URI
        try {
            do {
                Console.WriteLine("Linking to " + uristr);
                /* Create a WebRequest to the specified URI. */
                HttpWebRequest req = (HttpWebRequest)
                    WebRequest.Create(uristr);
                uristr = null; // disallow further use of this URI
                // Send that request and return the response.
                HttpWebResponse resp = (HttpWebResponse)
                    req.GetResponse();
                // From the response, obtain an input stream.
                Stream istrm = resp.GetResponseStream();
                // Wrap the input stream in a StreamReader.
                StreamReader rdr = new StreamReader(istrm);
                // Read in the entire page.
                str = rdr.ReadToEnd();
                curloc = 0;
                do {
                    // Find the next URI to link to.
                    link = FindLink(str, ref curloc);
                    if(link != null) {
                        Console.WriteLine("Link found: " + link);
                        Console.Write("Link, More, Quit?");
                        answer = Console.ReadLine();
                        // The original text's "string.rupare" is garbled
                        // string.Compare (case-insensitive when the third
                        // argument is true).
                        if(string.Compare(answer, "L", true) == 0) {
                            uristr = string.Copy(link);
                            break;
                        } else if(string.Compare(answer, "Q", true) == 0) {
                            break;
                        } else if(string.Compare(answer, "M", true) == 0) {
                            Console.WriteLine("Searching for another link.");
                        }
                    } else {
                        Console.WriteLine("No link found.");
                        break;
                    }
                } while(link.Length > 0);
                // Close the Response.
                resp.Close();
            } while(uristr != null);
        } catch(WebException exc) {
            Console.WriteLine("Network Error: " + exc.Message +
                "\nStatus code: " + exc.Status);
        } catch(ProtocolViolationException exc) {
            Console.WriteLine("Protocol Error: " + exc.Message);
        } catch(UriFormatException exc) {
            Console.WriteLine("URI Format Error: " + exc.Message);
        } catch(NotSupportedException exc) {
            Console.WriteLine("Unknown Protocol: " + exc.Message);
        } catch(IOException exc) {
            Console.WriteLine("I/O Error: " + exc.Message);
        }
        Console.WriteLine("Terminating MiniCrawler.");
    }
}
Output webpage content
using System.Net;
using System;
using System.IO;
public class WebPagesApp {
    // Fetches a page and prints the contents of its <HEAD> element.
    [STAThread]
    public static void Main(string[] args) {
        string s = "http://www.microsoft.ru";
        Uri uri = new Uri(s);
        WebRequest req = WebRequest.Create(uri);
        // Dispose response/stream/reader so the connection is released
        // even if reading throws.
        using (WebResponse resp = req.GetResponse())
        using (Stream str = resp.GetResponseStream())
        using (StreamReader sr = new StreamReader(str)) {
            string t = sr.ReadToEnd();
            // Match the tags case-insensitively: real pages use <head>.
            int i = t.IndexOf("<HEAD>", StringComparison.OrdinalIgnoreCase);
            int j = t.IndexOf("</HEAD>", StringComparison.OrdinalIgnoreCase);
            if (i >= 0 && j > i) {
                // Substring's second argument is a LENGTH, not an end
                // index — the original Substring(i, j) was a bug.
                string u = t.Substring(i, j - i + "</HEAD>".Length);
                Console.WriteLine("{0}", u);
            } else {
                Console.WriteLine("No <HEAD> element found.");
            }
        }
    }
}
Set the BaseAddress for WebClient
using System;
using System.Collections.Generic;
using System.Text;
using System.Net;
class Program {
    // Demonstrates WebClient.BaseAddress: relative download URIs are
    // resolved against it, so "Office" fetches microsoft.ru/Office.
    static void Main(string[] args) {
        // Dispose the WebClient so its connection resources are released.
        using (WebClient client = new WebClient()) {
            client.BaseAddress = "http://www.microsoft.ru";
            string data = client.DownloadString("Office");
            Console.WriteLine(data);
        }
    }
}