Example #1
        // NOTE: the original access modifier and return type were truncated in the source;
        // the tuple shape is inferred from the return statement, and the LinkInfo / Token
        // element type names are assumptions based on how the result is consumed.
        private (UrlFile UrlFile, List<LinkInfo> Links, List<Token> Tokens) ParseFetchedFile(FetchedFile fetchedFile, IList<string> inLinkTexts)
        {
            UniversalFileParser parser = new UniversalFileParser();

            parser.SetFile(fetchedFile.MimeType, fetchedFile.Url, fetchedFile.Charset, fetchedFile.FilePath);

            // The parser's async getters are awaited synchronously here because this
            // method itself is synchronous.
            var content     = parser.GetContentAsync().GetAwaiter().GetResult();
            var textContent = parser.GetTextContentAsync().GetAwaiter().GetResult();
            var title       = parser.GetTitleAsync().GetAwaiter().GetResult();
            var headers     = parser.GetHeadersAsync().GetAwaiter().GetResult();
            var links       = parser.GetLinksAsync().GetAwaiter().GetResult();
            var tokens      = parser.GetTokensAsync(inLinkTexts).GetAwaiter().GetResult();

            var urlFile = new UrlFile
            {
                PublishDate       = (ulong)parser.GetPublishDateAsync().GetAwaiter().GetResult().ToBinary(),
                Charset           = fetchedFile.Charset,
                Content           = content,
                FileHash          = fetchedFile.FileHash,
                FilePath          = fetchedFile.FilePath,
                MimeType          = fetchedFile.MimeType,
                TextContent       = textContent,
                Title             = title,
                Url               = fetchedFile.Url,
                HeaderTotalLength = (uint)headers.Sum(o => o.Text.Length),
                // Level-weighted header count: an <h1> contributes 5, an <h6> contributes 0
                HeaderCount       = (uint)headers.Sum(o => (6 - o.Level)),
                InLinkCount       = (uint)inLinkTexts.Count,
                InLinkTotalLength = (uint)inLinkTexts.Sum(o => o.Length),
                PageRank          = 0.1,
            };

            return (urlFile, links, tokens);
        }
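Every example receives a FetchedFile produced by the crawler's download step. Its definition is not part of this listing, so the following is a minimal sketch inferred from the members the examples access; the property types (notably string for FileHash) are assumptions.

        // Minimal sketch of FetchedFile, inferred from usage in the examples above.
        // Property types, in particular FileHash, are assumptions.
        public class FetchedFile
        {
            public string Url      { get; set; }   // source URL of the fetched document
            public string FilePath { get; set; }   // local path of the downloaded copy
            public string MimeType { get; set; }   // e.g. "text/html"
            public string Charset  { get; set; }   // e.g. "utf-8"
            public string FileHash { get; set; }   // content hash used for duplicate detection
        }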
Example #2
        public (string Url, string Content) JudgeContent(FetchedFile fetchedFile, string content)
        {
            var sameFiles = mConfig.UrlFileStore.GetUrlFilesByHash(fetchedFile.FileHash);
            var host      = UrlHelper.GetHost(fetchedFile.Url);

            foreach (var file in sameFiles)
            {
                if (file.Url == fetchedFile.Url)
                {
                    continue;
                }

                var currentHost = UrlHelper.GetHost(file.Url);

                bool isSameDns = false;

                try
                {
                    isSameDns = Dns.GetHostAddresses(currentHost).SequenceEqual(Dns.GetHostAddresses(host));
                }
                catch (Exception)
                {
                    // DNS resolution failed for one of the hosts; treat them as different.
                }

                if (content == file.Content &&
                    (currentHost == host || isSameDns))
                {
                    mLogger.Log(nameof(SimpleSimilarContentManager), $"Found same UrlFile for {fetchedFile.Url}: {file.Url}");
                    return (file.Url, file.Content);
                }
            }

            return (null, null);
        }
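JudgeContent resolves hosts with UrlHelper.GetHost, which is not shown in this listing. A plausible stand-in (an assumption, not the project's actual implementation) is simply:

        // Hypothetical stand-in for the UrlHelper.GetHost call used above:
        // extract the host part of a URL.
        public static class UrlHelper
        {
            public static string GetHost(string url)
            {
                return new Uri(url).Host;
            }
        }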
Example #3
        private string GetSameUrl(FetchedFile fetchedFile, string content)
        {
            // Check whether another already-stored file has the same content as this one
            var (sameUrl, sameContent) = mSimilarContentJudger.JudgeContent(fetchedFile, content);

            lock (mContentUrlDictSyncLock)
            {
                if (mContentUrlDict.ContainsKey(content))
                {
                    return mContentUrlDict[content];
                }
                if (sameUrl == null)
                {
                    mContentUrlDict.Add(content, fetchedFile.Url);
                }
            }

            return sameUrl;
        }
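GetSameUrl assumes a shared content-to-URL map guarded by a dedicated lock object. A minimal sketch of those two fields, inferred from the calls above (the concrete types are assumptions):

        // Maps page content to the first URL seen with that content. Guarded by the
        // lock object below because several crawl tasks may call GetSameUrl at once.
        // (Field types are inferred from usage and therefore assumptions.)
        private readonly Dictionary<string, string> mContentUrlDict = new Dictionary<string, string>();
        private readonly object mContentUrlDictSyncLock = new object();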
Example #4
        protected void FetchUrlAsync(string url)
        {
            mFetchSemaphore.Wait();
            mLogger.Log(nameof(Crawler), "Begin Crawl: " + url, true);
            var t = Task.Run(() =>
            {
                FetchedFile fetchedFile = null;
                try
                {
                    // Fetch Url
                    fetchedFile = FetchUrl(url);
                    if (fetchedFile == null)
                    {
                        return;
                    }

                    // Get InLink Texts
                    var inLinkTexts = GetInLinkTexts(fetchedFile.Url);

                    // Parse File
                    var (urlFile, linkList, tokens) = ParseFetchedFile(fetchedFile, inLinkTexts);

                    if (GetSameUrl(fetchedFile, urlFile.Content) != null)
                    {
                        // Has Same UrlFile, Skip
                        return;
                    }

                    // Save New UrlFile
                    // Get Old id and New id
                    ulong oldUrlFileId;
                    (urlFile, oldUrlFileId) = mConfig.UrlFileStore.SaveUrlFileAndGetOldId(urlFile);

                    // Clear old data
                    if (oldUrlFileId != 0)
                    {
                        mConfig.PostingListStore.ClearPostingLists(oldUrlFileId);
                        mConfig.InvertedIndexStore.ClearIndices(oldUrlFileId);
                        mConfig.LinkStore.ClearLinks(oldUrlFileId);
                    }

                    lock (mContentUrlDictSyncLock)
                    {
                        mContentUrlDict.Remove(urlFile.Content);
                    }

                    // Filter Links
                    linkList = FilterLinks(linkList);

                    // Save links
                    mConfig.LinkStore.SaveLinks(urlFile.UrlFileId,
                                                linkList.Select(o => new Link
                    {
                        Text      = o.Text,
                        Url       = o.Url,
                        UrlFileId = urlFile.UrlFileId,
                    }));

                    // Save Indices
                    SaveIndices(tokens, inLinkTexts, urlFile, oldUrlFileId);

                    // Add newly-found urls
                    var urls = linkList.Select(o => o.Url).Distinct();
                    mUrlFrontier.PushUrls(urls);

                    // Push Back This Url
                    mUrlFrontier.PushBackUrl(url, urlFile.UpdateInterval);

                    mLogger.Log(nameof(Crawler), "End Crawl: " + url);
                }
                catch (NotSupportedException e)
                {
                    mLogger.LogException(nameof(Crawler), "Not supported file format: " + url, e, false);
                    mErrorLogger.LogException(nameof(Crawler), "Not supported file format: " + url, e);

                    // Retry
                    mUrlFrontier.PushBackUrl(url, 0, true);
                }
                catch (InvalidDataException e)
                {
                    mLogger.LogException(nameof(Crawler), "Invalid data: " + url, e);
                    mErrorLogger.LogException(nameof(Crawler), "Invalid data: " + url, e);

                    // Retry
                    mUrlFrontier.PushBackUrl(url, 0, true);
                }
                catch (UriFormatException e)
                {
                    mLogger.LogException(nameof(Crawler), "Invalid Uri: " + url, e);
                    mErrorLogger.LogException(nameof(Crawler), "Invalid Uri: " + url, e);

                    mUrlFrontier.RemoveUrl(url);
                }
                catch (IOException e)
                {
                    mLogger.LogException(nameof(Crawler), "Failed to fetch: " + url, e, false);
                    mErrorLogger.LogException(nameof(Crawler), "Failed to fetch: " + url, e);

                    mUrlFrontier.RemoveUrl(url);
                }
                catch (Exception e) when (e is OperationCanceledException || e is TaskCanceledException)
                {
                    mUrlFrontier.PushBackUrl(url, 0);
                }
                catch (Exception e)
                {
                    mLogger.LogException(nameof(Crawler), "Failed to crawl url: " + url, e);
                    mErrorLogger.LogException(nameof(Crawler), "Failed to crawl url: " + url, e);

                    // Retry
                    mUrlFrontier.PushBackUrl(url, 0, true);
                }
                finally
                {
                    mFetchSemaphore.Release();
                    if (fetchedFile != null && File.Exists(fetchedFile.FilePath))
                    {
                        File.Delete(fetchedFile.FilePath);
                    }
                }
            }).ContinueWith(task =>
            {
                mTasks.TryRemove(task, out bool v);
            });

            mTasks.TryAdd(t, true);
        }
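FetchUrlAsync relies on several members declared elsewhere in the crawler. The sketch below reconstructs them from the call sites above (using System.Collections.Generic, System.Collections.Concurrent, System.Threading and System.Threading.Tasks); the initial semaphore count, the parameter names and the interval type of PushBackUrl are assumptions.

        // Limits the number of concurrent fetch tasks: Wait() before a task starts,
        // Release() in its finally block. (The initial count of 10 is an assumption.)
        private readonly SemaphoreSlim mFetchSemaphore = new SemaphoreSlim(10);

        // Tracks in-flight crawl tasks so they can be awaited or counted elsewhere.
        private readonly ConcurrentDictionary<Task, bool> mTasks = new ConcurrentDictionary<Task, bool>();

        // URL frontier operations used above, inferred from the call sites
        // (parameter names and the interval type are assumptions).
        public interface IUrlFrontier
        {
            void PushUrls(IEnumerable<string> urls);                            // enqueue newly-found URLs
            void PushBackUrl(string url, ulong interval, bool failed = false);  // re-queue a crawled or failed URL
            void RemoveUrl(string url);                                         // drop a URL permanently
        }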