MiniCrawler: A skeletal Web crawler

   

/*
C#: The Complete Reference 
by Herbert Schildt 

Publisher: Osborne/McGraw-Hill (March 8, 2002)
ISBN: 0072134852
*/


// MiniCrawler: A skeletal Web crawler. 
 
using System; 
using System.Net; 
using System.IO; 
 
public class MiniCrawler {  
 
  // Find the next absolute http/https link in htmlstr.
  //
  // htmlstr:  the page content to search.
  // startloc: index at which to begin searching. When a link is found it
  //           is advanced to the link's closing quote, so the next call
  //           continues after this link. It is left unchanged otherwise.
  //
  // Returns the URI text between the quotes of the next href="http..."
  // attribute, or null when there is no further (well-formed) link.
  static string FindLink(string htmlstr,  
                         ref int startloc) { 
    // Ordinal, case-insensitive search matches HREF/href without
    // allocating a lower-cased copy of the whole page.
    int i = htmlstr.IndexOf("href=\"http", startloc, 
                            StringComparison.OrdinalIgnoreCase); 
    if(i == -1) return null; 
 
    int start = htmlstr.IndexOf('"', i) + 1; // first char after opening quote 
    int end = htmlstr.IndexOf('"', start);   // closing quote 
 
    // An unterminated attribute (truncated page) has no usable URI.
    if(end == -1) return null; 
 
    startloc = end; 
    return htmlstr.Substring(start, end - start); 
  } 
 
  // Interactive crawl loop: fetch the URI given on the command line, show
  // each link found in the page, and let the user follow it (L), look for
  // another one (M) or quit (Q).
  public static void Main(string[] args) { 
    if(args.Length != 1) { 
      Console.WriteLine("Usage: MiniCrawler <uri>"); 
      return; 
    } 
 
    string uristr = args[0]; // URI to visit next; null means stop 
 
    try { 
 
      do { 
        Console.WriteLine("Linking to " + uristr); 
 
        // Use the base WebRequest/WebResponse types: casting to the Http*
        // types would throw an uncaught InvalidCastException for a
        // non-HTTP URI supplied on the command line.
        WebRequest req = WebRequest.Create(uristr); 
 
        uristr = null; // disallow further use of this URI 
 
        // Read in the entire page. The using statements release the
        // connection even when reading fails.
        string str; 
        using(WebResponse resp = req.GetResponse()) 
        using(StreamReader rdr = new StreamReader(resp.GetResponseStream())) { 
          str = rdr.ReadToEnd(); 
        } 
 
        int curloc = 0; // current search position within the page 
 
        while(true) { 
          // Find the next URI to link to. 
          string link = FindLink(str, ref curloc); 
 
          if(link == null) { 
            Console.WriteLine("No link found."); 
            break; 
          } 
 
          Console.WriteLine("Link found: " + link); 
 
          Console.Write("Link, More, Quit?"); 
          string answer = Console.ReadLine(); 
 
          // End of input (null) is treated like Quit.
          if(answer == null || 
             string.Equals(answer, "Q", StringComparison.OrdinalIgnoreCase)) { 
            break; 
          } else if(string.Equals(answer, "L", StringComparison.OrdinalIgnoreCase)) { 
            uristr = link; 
            break; 
          } else if(string.Equals(answer, "M", StringComparison.OrdinalIgnoreCase)) { 
            Console.WriteLine("Searching for another link."); 
          } 
          // Any other answer silently moves on to the next link.
        } 
      } while(uristr != null); 
 
    } catch(WebException exc) { 
      Console.WriteLine("Network Error: " + exc.Message +  
                        "\nStatus code: " + exc.Status); 
    } catch(ProtocolViolationException exc) { 
      Console.WriteLine("Protocol Error: " + exc.Message); 
    } catch(UriFormatException exc) { 
      Console.WriteLine("URI Format Error: " + exc.Message); 
    } catch(NotSupportedException exc) { 
      Console.WriteLine("Unknown Protocol: " + exc.Message); 
    } catch(IOException exc) { 
      Console.WriteLine("I/O Error: " + exc.Message); 
    } 
 
    Console.WriteLine("Terminating MiniCrawler."); 
  } 
}


           
          


Check the ContentType

   



using System;
using System.IO;
using System.Net;
   
// Dumps the body of a fixed URI to the console, provided the server
// reports a text/* content type.
class HtmlDump
{
     // Returns 0 on success, 1 when the request fails or the resource is
     // not a text type. The command-line arguments are not used.
     public static int Main(string[] astrArgs)
     {
          WebResponse webres;
   
          try
          {
               WebRequest webreq = WebRequest.Create("http://www.kutayzorlu.com/java2s/com/");
               webres = webreq.GetResponse();
          }
          catch (Exception exc)
          {
               Console.WriteLine("HtmlDump: {0}", exc.Message);
               return 1;
          }
   
          // Dispose the response on every path, including the early return
          // below, so the underlying connection is released.
          using (webres)
          {
               // StartsWith cannot throw on a short or empty content type
               // (Substring(0, 4) did), and media types are compared
               // case-insensitively.
               string contentType = webres.ContentType;
               if (contentType == null ||
                   !contentType.StartsWith("text", StringComparison.OrdinalIgnoreCase))
               {
                    Console.WriteLine("HtmlDump: URI must be a text type.");
                    return 1;
               }
   
               using (StreamReader strrdr = new StreamReader(webres.GetResponseStream()))
               {
                    string strLine;
   
                    while ((strLine = strrdr.ReadLine()) != null)
                    {
                         Console.WriteLine(strLine);
                    }
               }
          }
          return 0;
     }
}


           
          


Get a WebResponse from a WebRequest

   


using System;
using System.Net;
using System.IO;
using System.Drawing;
using System.Windows.Forms;

// Fetches a picture and an HTML page with WebRequest/WebResponse: the
// picture is decoded into an Image, the page text is written to the console.
public class MainClass {
    public static void Main() {
        string picUri = "http://international.us.server12.fileserver.kutayzorlu.com/files/download/2017/01/Hex_RGB4_uuid-fa046b6c-fa2c-4f3e-838c-80ed56faee5c_crc-0.jpg";
        string htmlUri = "http://www.apress.com";

        WebRequest requestPic = WebRequest.Create(picUri);
        WebRequest requestHtml = WebRequest.Create(htmlUri);

        // Image.FromStream requires the stream to stay open for the lifetime
        // of the Image, so the image is the innermost resource and is
        // disposed before its stream and response.
        using (WebResponse responsePic = requestPic.GetResponse())
        using (Stream picStream = responsePic.GetResponseStream())
        using (Image img = Image.FromStream(picStream)) {
            // The decoded image is not used further in this sample.
        }

        using (WebResponse responseHtml = requestHtml.GetResponse())
        using (StreamReader r = new StreamReader(responseHtml.GetResponseStream())) {
            Console.WriteLine(r.ReadToEnd());
        }
    }
}
           
          


Output webpage content

   



using System.Net;
using System;
using System.IO;
// Downloads a page and prints its head section: everything from the
// opening <HEAD> tag up to, but not including, the closing </HEAD> tag.
public class WebPagesApp {
    [STAThread]
    public static void Main(string[] args) {
        Uri uri = new Uri("http://www.microsoft.com");
        WebRequest req = WebRequest.Create(uri);

        // Read the whole page, releasing the connection afterwards.
        string page;
        using (WebResponse resp = req.GetResponse())
        using (StreamReader sr = new StreamReader(resp.GetResponseStream())) {
            page = sr.ReadToEnd();
        }

        // Tag names are case-insensitive in HTML, so search ordinally
        // ignoring case; look for the closing tag only after the opening one.
        int headStart = page.IndexOf("<HEAD>", StringComparison.OrdinalIgnoreCase);
        int headEnd = headStart < 0
            ? -1
            : page.IndexOf("</HEAD>", headStart, StringComparison.OrdinalIgnoreCase);

        if (headEnd < 0) {
            Console.WriteLine("No <HEAD>...</HEAD> section found.");
            return;
        }

        // Substring takes a length, not an end index.
        Console.WriteLine("{0}", page.Substring(headStart, headEnd - headStart));
    }
}

           
          


Download a web page in a thread

   


using System;
using System.Net;
using System.Threading;

// Downloads a web page on a worker thread while the main thread stays
// responsive and waits for the user to press Enter.
class ThreadTest {
    static void Main() {
        new Thread(Download).Start();
        Console.WriteLine("download's happening!");
        Console.ReadLine();
    }

    // Worker-thread body: saves http://www.google.com to index.html in the
    // current directory and reports the outcome on the console.
    static void Download() {
        using (WebClient wc = new WebClient()) {
            try {
                wc.Proxy = null; // no proxy is used for this request
                wc.DownloadFile("http://www.google.com", "index.html");
                Console.WriteLine("Finished!");
            } catch (Exception ex) {
                // An exception escaping a thread would terminate the process,
                // so it is caught here, but it is reported, not swallowed.
                Console.WriteLine("Download failed: " + ex.Message);
            }
        }
    }
}
           
          


Download a page with DownloadString and fetch the GIF images it references

   





using System;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;

// Downloads a page as a string, finds every absolute URL ending in ".gif"
// and saves each image to the current directory under its own file name.
class MainClass {
    private static void Main() {
        string remoteUri = "http://www.apress.com";

        using (WebClient client = new WebClient()) {
            string str = client.DownloadString(remoteUri);

            // "http", a run of non-whitespace characters, one character that
            // is not - , ; : or ?, then a literal ".gif".
            MatchCollection matches = Regex.Matches(str, @"http\S+[^-,;:?]\.gif");

            foreach (Match match in matches) {
                // The pattern has no capture groups, so the whole match is
                // the image URL.
                string imageUrl = match.Value;
                string file = imageUrl.Substring(imageUrl.LastIndexOf('/') + 1);
                try {
                    Console.WriteLine("Downloading {0} to file {1}", imageUrl, file);
                    client.DownloadFile(new Uri(imageUrl), file);
                } catch (Exception ex) {
                    // Best effort: report the failure and go on to the next image.
                    Console.WriteLine("Failed to download {0}: {1}", imageUrl, ex.Message);
                }
            }
        }
    }
}
           
          


Set the BaseAddress for WebClient

   


using System;
using System.Collections.Generic;
using System.Text;
using System.Net;

// Shows WebClient.BaseAddress: the relative address passed to
// DownloadString is resolved against the base address.
class Program {
    static void Main(string[] args) {
        // WebClient is IDisposable; the using statement releases it even if
        // the download throws.
        using (WebClient client = new WebClient()) {
            client.BaseAddress = "http://www.microsoft.com";
            string data = client.DownloadString("Office");
            Console.WriteLine(data);
        }
    }
}