Exemplo n.º 1
0
        /// <summary>
        /// Builds a follow-up <see cref="CrawlJob"/> for a link discovered on this page.
        /// Absolute links are used as-is; relative links are resolved against this
        /// job's <c>Url</c>.
        /// </summary>
        /// <param name="forLink">The parsed link extracted from the page content.</param>
        /// <returns>A new job pointing at the resolved URL, inheriting this job's link options.</returns>
        public CrawlJob CreateJob(ParsedLink forLink)
        {
            // Avoid exception-driven control flow: probe for an absolute URL first
            // instead of catching UriFormatException.
            Uri url;
            if (!Uri.TryCreate(forLink.Url, UriKind.Absolute, out url)) {
                // Relative URL: resolve against the current URL. The Uri(Uri, string)
                // constructor performs RFC 3986 resolution (drops the last path
                // segment of the base), matching the old manual substring logic.
                url = new Uri(Url, forLink.Url);
            }

            CrawlJob job = new CrawlJob(url);
            job.LinkOpts = LinkOpts;

            // TODO: rest of the params plus maybe some checkings
            return job;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Downloads the content for <paramref name="job"/>, parses it with every
        /// parser registered for its MIME type, indexes the parsed content, and
        /// recursively indexes any discovered links (depth-limited by LinkOpts).
        /// </summary>
        /// <param name="job">The crawl job describing the URL and link-following options.</param>
        void Index(CrawlJob job)
        {
            Logger.Debug(string.Format("Parsing URL {0}", job.Url));

            // download content
            IDictionary<string, string> httpHeaders;
            string rawContent;
            try {
                rawContent = Download(job, out httpHeaders);
            } catch (WebException ex) {
                Logger.Error("Error downloading " + job.Url, ex);
                return;
            }

            // TODO: treat exceptions

            // BUG FIX: the indexer would throw KeyNotFoundException on responses
            // without a Content-Type header; skip such responses instead.
            string mimeContentType;
            if (!httpHeaders.TryGetValue("Content-Type", out mimeContentType)) {
                Logger.Debug(string.Format("No Content-Type header for {0}, skipping", job.Url));
                return;
            }

            // Strip any parameters, e.g. "text/html; charset=utf-8" -> "text/html".
            int paramSeparator = mimeContentType.IndexOf(';');
            if (paramSeparator > 0)
                mimeContentType = mimeContentType.Substring(0, paramSeparator);

            // BUG FIX: format string was missing its {0} placeholder, so the
            // content type was never actually logged.
            Logger.Debug(string.Format("  > MIME Content Type: {0}", mimeContentType));

            IList<IContentParser> parsersByMimeType = FindParser.ByMimeContentType(mimeContentType);
            foreach (IContentParser parser in parsersByMimeType) {
                foreach (ParsedContent parsed in parser.ParseRaw(rawContent, job.LinkOpts)) {

                    // fill in the rest of the data
                    parsed.Location = job.Url.ToString();
                    parsed.Sources = new List<string>() { job.Url.Host };

                    SbApp.Instance.SearchEngine.Index(parsed);

                    // if it has links, index them too
                    // (MaxDepth == -1 means unlimited depth)
                    if (parsed.LinkOpts.Follow && (parsed.LinkOpts.MaxDepth == -1 || parsed.LinkOpts.CurrentDepth <= parsed.LinkOpts.MaxDepth)) {
                        foreach (var link in parsed.Links) {
                            Index(job.CreateJob(link));
                        }
                    }
                }
            }
        }
Exemplo n.º 3
0
        /// <summary>
        /// Performs a synchronous HTTP GET of the job's URL and returns the
        /// trimmed response body, exposing the response headers via
        /// <paramref name="httpHeaders"/>.
        /// </summary>
        /// <param name="opts">Job carrying the target URL and timeout (in seconds).</param>
        /// <param name="httpHeaders">Receives a snapshot of the HTTP response headers.</param>
        /// <returns>The response body with leading/trailing whitespace removed.</returns>
        /// <exception cref="WebException">Thrown on network/protocol errors or timeout.</exception>
        string Download(CrawlJob opts, out IDictionary<string, string> httpHeaders)
        {
            System.Net.ServicePointManager.Expect100Continue = false;

            HttpWebRequest httpRequest = (HttpWebRequest)WebRequest.Create(opts.Url);
            httpRequest.Timeout = opts.TimeoutSec * 1000;

            httpHeaders = new Dictionary<string, string>();

            // BUG FIX: the original leaked the StreamReader, leaked the response
            // if ReadToEnd threw, and read response.Headers AFTER Close().
            // Using-blocks guarantee disposal, and headers are snapshotted while
            // the response is still open.
            using (HttpWebResponse response = (HttpWebResponse)httpRequest.GetResponse())
            using (System.IO.StreamReader reader = new System.IO.StreamReader(response.GetResponseStream())) {
                foreach (string header in response.Headers.AllKeys) {
                    httpHeaders[header] = response.Headers[header];
                }

                return reader.ReadToEnd().Trim();
            }
        }