Example #1
        private void ThreadStartFunction()
        {
            CrawlerWebRequest request = null;

            // a worker exits when the engine stops or when its index (stored
            // in the thread name) reaches or exceeds the current pool size
            while (state_ == EngineState.engine_state_running &&
                   int.Parse(Thread.CurrentThread.Name) < threadsCount_)
            {
                CrawlerUri uri = DequeueUri();
                if (uri != null && uri.depth_ <= crawlerSettings_.maxDepth_)
                {
                    if (crawlerOutput_.FindCrawlerUri(uri) != null) // this uri has been parsed.
                    {
                        continue;                                   // do not parse it.
                    }
                    ParseUri(uri, ref request);
                    if (crawlerSettings_.sleepConnectTime_ > 0)
                    {
                        Thread.Sleep(crawlerSettings_.sleepConnectTime_ * 1000);
                    }
                }
                else
                {
                    Thread.Sleep(crawlerSettings_.sleepFetchTime_ * 1000);
                }
            }
        }
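The loop condition above doubles as a shutdown mechanism: each worker's Thread.Name stores its index, so lowering threadsCount_ makes the surplus workers fall out of the loop on their next pass. A minimal standalone sketch of that trick, with illustrative names (ShrinkablePool, poolSize, Worker are not part of the crawler):

using System;
using System.Threading;

class ShrinkablePool
{
    // illustrative: desired number of live workers; volatile so
    // workers see updates made by the main thread
    private static volatile int poolSize = 4;

    private static void Worker()
    {
        // mirror ThreadStartFunction: parse this thread's name as its index
        while (int.Parse(Thread.CurrentThread.Name) < poolSize)
        {
            Thread.Sleep(100); // stand-in for DequeueUri/ParseUri work
        }
    }

    static void Main()
    {
        Thread[] threads = new Thread[4];
        for (int i = 0; i < threads.Length; i++)
        {
            threads[i] = new Thread(Worker);
            threads[i].Name = i.ToString();
            threads[i].Start();
        }
        Thread.Sleep(500);
        poolSize = 0;   // any worker whose index >= poolSize exits
        foreach (Thread t in threads)
        {
            t.Join();   // all workers drain out within one sleep interval
        }
    }
}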
Example #2
        // pop a uri from the queue; returns null when the queue is empty
        private CrawlerUri DequeueUri()
        {
            lock (queueUrls_)
            {
                // Dequeue() on an empty queue throws, so test Count first
                // instead of swallowing the exception
                if (queueUrls_.Count > 0)
                {
                    return (CrawlerUri)queueUrls_.Dequeue();
                }
                return null;
            }
        }
Example #3
        // add a Uri to the storage; returns true only if the node was added
        public bool AddUri(CrawlerUri uri)
        {
            bool bRet = false;

            lock (uriStorage)
            {
                try
                {
                    uriStorage.AddTreeNode(uri);
                    uriCount_++;
                    bRet = true; // report success to the caller
                }
                catch (Exception)
                {
                    // leave bRet false on failure
                }
            }
            return bRet;
        }
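A note on the locking pattern: lock (obj) { ... } expands to Monitor.Enter/Exit wrapped in try/finally, so the monitor is released even when the guarded code (AddTreeNode, Dequeue) throws. The expansion the C# 4.0+ compiler emits, sketched with body() standing in for the guarded region:

        // what `lock (uriStorage) { body(); }` compiles to in C# 4.0 and later
        bool lockTaken = false;
        try
        {
            Monitor.Enter(uriStorage, ref lockTaken);
            body();                       // the guarded region
        }
        finally
        {
            if (lockTaken)
            {
                Monitor.Exit(uriStorage); // runs even when body() throws
            }
        }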
Example #4
        // push a uri onto the queue
        private bool EnqueueUri(CrawlerUri uri, bool bCheckRepetition = true)
        {
            // check whether the uri has already been parsed
            if (bCheckRepetition == true &&
                crawlerOutput_.FindCrawlerUri(uri) != null)
            {
                return false; // the uri was parsed already, do not parse it again
            }
            lock (queueUrls_)
            {
                // add the uri to the queue
                queueUrls_.Enqueue(uri);
            }
            return true;
        }
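Note that the FindCrawlerUri check and the Enqueue run under different locks, so two workers can both pass the check with the same uri and enqueue it twice; the check at the top of ThreadStartFunction only catches the duplicate after it is dequeued. A sketch of an atomic variant, assuming a hypothetical HashSet<string> field named seenUris_ guarded by the queue's lock:

        // hypothetical field: private readonly HashSet<string> seenUris_ = new HashSet<string>();
        private bool EnqueueUriAtomic(CrawlerUri uri)
        {
            lock (queueUrls_)
            {
                // HashSet.Add returns false if the uri was seen before,
                // so test-and-enqueue happens under a single lock
                if (seenUris_.Add(uri.AbsoluteUri) == false)
                {
                    return false;
                }
                queueUrls_.Enqueue(uri);
                return true;
            }
        }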
Example #5
        // begin download and parse web site
        public int RunCrawling()
        {
            state_ = EngineState.engine_state_running;
            string fullUrl = crawlerInput_.fullUrl_;

            CrawlerUri.Normalize(ref fullUrl);
            CrawlerUri seedUri = new CrawlerUri(fullUrl);

            EnqueueUri(seedUri, true);
            crawlerInput_.fullUrl_ = fullUrl;
            threadsCount_          = crawlerSettings_.threadsCount_;

            System.Diagnostics.Debug.Assert(crawlerOutput_.FindCrawlerUri(seedUri) == null);
            crawlerOutput_.AddUri(seedUri);

            try
            {
                // start every worker first; joining inside the creation loop
                // would run the threads one at a time instead of in parallel
                for (int nIndex = 0; nIndex < threadsCount_; nIndex++)
                {
                    // check if thread not created or not suspended
                    if (threadsRunning_[nIndex] == null || threadsRunning_[nIndex].ThreadState != ThreadState.Suspended)
                    {
                        // create new thread
                        threadsRunning_[nIndex] = new Thread(new ThreadStart(ThreadStartFunction));
                        // set thread name equal to its index
                        threadsRunning_[nIndex].Name = nIndex.ToString();
                        // start thread working function
                        threadsRunning_[nIndex].Start();
                    }
                }
                // wait for every worker to finish
                for (int nIndex = 0; nIndex < threadsCount_; nIndex++)
                {
                    if (threadsRunning_[nIndex] != null)
                    {
                        threadsRunning_[nIndex].Join();
                    }
                }
            }
            catch (Exception ex)
            {
                // swallow and record; a failed Start should not crash the engine
                string errMsg = ex.Message;
            }

            return(0);
        }
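CrawlerUri.Normalize is called on the seed and on every extracted ref, but its body is not part of this section. A plausible sketch of what such a normalizer does, built on System.UriBuilder (illustrative only, not the project's actual implementation):

        // illustrative: lowercase canonical form, no fragment, no explicit default
        // port, so two spellings of the same page compare equal in the storage tree
        public static void Normalize(ref string url)
        {
            UriBuilder builder = new UriBuilder(url);  // canonicalizes scheme and host casing
            builder.Fragment = string.Empty;           // #section does not change the document
            if ((builder.Scheme == Uri.UriSchemeHttp  && builder.Port == 80) ||
                (builder.Scheme == Uri.UriSchemeHttps && builder.Port == 443))
            {
                builder.Port = -1;                     // omit the default port
            }
            url = builder.Uri.AbsoluteUri;
        }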
Example #6
        private void ParseUri(CrawlerUri uri, ref CrawlerWebRequest request)
        {
            try
            {
                // create web request
                request = CrawlerWebRequest.Create(uri, request, crawlerSettings_.keepAlive_);
                // set request timeout
                request.timeOut_ = crawlerSettings_.requestTimeout_ * 1000;
                // retrieve response from web request
                CrawlerWebResponse response = request.GetResponse();

                // check for redirection
                if (response.responseUri_.Equals(uri) == false)
                {
                    // add the new uri to the queue
                    EnqueueUri(new CrawlerUri(response.responseUri_.AbsoluteUri), true);
                    request = null;
                    return;
                }

                // check for allowed MIME types
                if (crawlerSettings_.allowAllMIMETypes_ == false &&
                    crawlerSettings_.allowedMIMETypes_.Length > 0 &&
                    response.contentType_ != null)
                {
                    // sample response.contentType: text/html; charset=utf-8
                    string strContentType = response.contentType_.ToLower();
                    int    nIndex         = strContentType.IndexOf(';');
                    if (nIndex != -1)
                    {
                        strContentType = strContentType.Substring(0, nIndex);
                    }

                    nIndex = crawlerSettings_.allowedMIMETypes_.IndexOf(strContentType);
                    // reject unless the type is listed or the settings allow a wildcard
                    if (crawlerSettings_.allowedMIMETypes_.IndexOf('*') == -1 && nIndex == -1)
                    {
                        // log: this MIME type is not listed.
                        request    = null;
                        uri.state_ = CrawlerUriParseState.uri_state_not_allowed_MIME_type;
                        return;
                    }

                    // each listed type is followed by its min/max size in KB;
                    // read the two numbers after the matched entry
                    if (nIndex != -1)
                    {
                        Match match = new Regex(@"\d+").Match(crawlerSettings_.allowedMIMETypes_, nIndex);
                        int   nMin  = int.Parse(match.Value) * 1024;
                        match = match.NextMatch();
                        int nMax = int.Parse(match.Value) * 1024;
                        if (nMin < nMax && (response.contentLength_ < nMin || response.contentLength_ > nMax))
                        {
                            // TODO: log that the content length (response.contentLength_)
                            // is outside the allowed range [nMin, nMax]
                            request = null;
                            return;
                        }
                    }
                }

                // check for excluded response file extensions
                bool shouldBeParsed = true;
                foreach (string ext in crawlerSettings_.excludeFiles_)
                {
                    // compare against the trimmed, lowercased extension
                    string trimmedExt = ext.Trim().ToLower();
                    if (trimmedExt.Length > 0 && uri.AbsoluteUri.ToLower().EndsWith(trimmedExt) == true)
                    {
                        shouldBeParsed = false;
                        break;
                    }
                }

                // construct the path on disk
                string strLocalPath = uri.LocalPath;
                // a path ending with / needs a file name before it can be created on disk
                if (strLocalPath.EndsWith("/") == true)
                {
                    // only when there is no query string like (.asp?i=32&j=212)
                    if (uri.Query == "")
                    {
                        // add a default name for /-terminated paths
                        strLocalPath += "default.html";
                    }
                }
                // check if the uri includes a query string
                if (uri.Query != "")
                {
                    // derive the name from the query's hash so a re-download maps to the same file
                    strLocalPath += uri.Query.GetHashCode() + ".html";
                }
                // construct the full folder path
                string BasePath = crawlerSettings_.downloadfolder_ + "\\" + uri.Host + Path.GetDirectoryName(uri.AbsolutePath);
                // create the folder if it does not exist yet
                if (Directory.Exists(BasePath) == false)
                {
                    Directory.CreateDirectory(BasePath);
                }
                // construct the full path name of the file
                string PathName = crawlerSettings_.downloadfolder_ + "\\" + uri.Host + strLocalPath.Replace("%20", " ");
                // open the output file; using guarantees the streams are closed
                // even if Receive throws mid-download
                string strResponse = "";
                int    nTotalBytes = 0;
                using (FileStream   fStream = File.Open(PathName, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
                using (BinaryWriter bWriter = new BinaryWriter(fStream))
                {
                    byte[] recvBuffer = new byte[8192];
                    int    nBytes;
                    // loop to receive the response buffer
                    while ((nBytes = response.socket_.Receive(recvBuffer, 0, 8192, SocketFlags.None)) > 0)
                    {
                        // increment total received bytes
                        nTotalBytes += nBytes;
                        // write the received buffer to the file
                        bWriter.Write(recvBuffer, 0, nBytes);
                        // non-binary content is kept in memory so it can be parsed for refs
                        if (shouldBeParsed == true)
                        {
                            // append the received buffer to the response string
                            strResponse += Encoding.ASCII.GetString(recvBuffer, 0, nBytes);
                        }
                        // on a keep-alive connection the socket stays open, so stop
                        // once the declared content length has been received
                        if (response.keepAlive_ && response.contentLength_ > 0 && nTotalBytes >= response.contentLength_)
                        {
                            break;
                        }
                    }
                }

                if (!response.keepAlive_)
                {
                    // close response
                    response.Close();
                }
                // increment total file count
                crawlerOutput_.fileCount_++;
                // increment total bytes count
                crawlerOutput_.byteCount_ += nTotalBytes;

                if (shouldBeParsed == true)
                {
                    // check for restricted words
                    foreach (string strExcludeWord in crawlerSettings_.excludeWords_)
                    {
                        if (strExcludeWord.Trim().Length > 0 && strResponse.IndexOf(strExcludeWord) != -1)
                        {
                            File.Delete(PathName);
                            return;
                        }
                    }

                    // parse the page to search for refs
                    string          strRef  = @"(href|HREF|src|SRC)[ ]*=[ ]*[""'][^""'#>]+[""']";
                    MatchCollection matches = new Regex(strRef).Matches(strResponse);
                    crawlerOutput_.uriCount_ += matches.Count;
                    foreach (Match match in matches)
                    {
                        strRef = match.Value.Substring(match.Value.IndexOf('=') + 1).Trim('"', '\'', '#', ' ', '>');
                        try
                        {
                            if (strRef.IndexOf("..") != -1 || strRef.StartsWith("/") == true || strRef.StartsWith("http://") == false)
                            {
                                strRef = new Uri(uri, strRef).AbsoluteUri;
                            }
                            CrawlerUri.Normalize(ref strRef);
                            CrawlerUri newUri = new CrawlerUri(strRef);
                            if (newUri.Scheme != Uri.UriSchemeHttp && newUri.Scheme != Uri.UriSchemeHttps)
                            {
                                continue;
                            }
                            if (newUri.Host != uri.Host && crawlerSettings_.keepSameServer_ == true)
                            {
                                continue;
                            }
                            newUri.depth_ = uri.depth_ + 1;
                            EnqueueUri(newUri, true);
                        }
                        catch (Exception)
                        {
                            // malformed ref; skip it
                        }
                    }
                }
            }
            catch (Exception)
            {
                // on any request/response failure, drop the request so the
                // next iteration opens a fresh connection
                request = null;
            }
        }
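One performance note on the download loop above: strResponse += Encoding.ASCII.GetString(...) copies the entire accumulated string on every buffer, which is quadratic in page size. A sketch of the same accumulation done linearly with System.Text.StringBuilder (AccumulateAscii is an illustrative helper, not part of the crawler):

        using System.Collections.Generic;
        using System.Text;

        // illustrative: append chunks into a StringBuilder and materialize once
        static string AccumulateAscii(IEnumerable<byte[]> chunks)
        {
            StringBuilder builder = new StringBuilder();
            foreach (byte[] chunk in chunks)
            {
                // Append copies only the new chunk, so the loop stays linear
                builder.Append(Encoding.ASCII.GetString(chunk));
            }
            return builder.ToString();
        }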
        // find a Uri in the storage
        public CrawlerBinaryTreeNode<CrawlerUri> FindCrawlerUri(CrawlerUri uri)
        {
            return uriStorage.FindTreeNodeByData(uri);
        }
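Taken together, one plausible per-uri flow these helpers support (the section never shows where parsed uris are recorded in the storage, so the AddUri call below is an assumption, and ProcessOne is an illustrative name):

        // illustrative: visit each uri at most once using the queue and the storage tree
        private void ProcessOne()
        {
            CrawlerUri uri = DequeueUri();
            if (uri == null)
            {
                return;                              // queue is empty
            }
            if (crawlerOutput_.FindCrawlerUri(uri) != null)
            {
                return;                              // already visited
            }
            crawlerOutput_.AddUri(uri);              // assumption: record before parsing
            CrawlerWebRequest request = null;
            ParseUri(uri, ref request);              // download and extract new refs
        }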