// Parses the fetched file on disk into a UrlFile entity plus the outgoing
// links and the token list used for indexing.
//
// fetchedFile  - metadata of the downloaded file (url, mime type, charset,
//                content hash, local file path).
// inLinkTexts  - anchor texts of links pointing TO this url; fed to the
//                tokenizer and folded into the UrlFile statistics.
//
// NOTE(review): the access modifier / return type is not visible in this
// chunk; the final statement implies a (urlFile, links, tokens) tuple.
// NOTE(review): every parser call blocks on an async API via
// GetAwaiter().GetResult() (sync-over-async). The caller already runs this
// on a worker task (see FetchUrlAsync), but an async signature returning
// Task<(...)> would avoid tying up pool threads.
ParseFetchedFile(FetchedFile fetchedFile, IList <string> inLinkTexts)
{
    UniversalFileParser parser = new UniversalFileParser();
    parser.SetFile(fetchedFile.MimeType, fetchedFile.Url, fetchedFile.Charset, fetchedFile.FilePath);

    // Extract every artifact the indexer needs from the parsed document.
    var content = parser.GetContentAsync().GetAwaiter().GetResult();
    var textContent = parser.GetTextContentAsync().GetAwaiter().GetResult();
    var title = parser.GetTitleAsync().GetAwaiter().GetResult();
    var headers = parser.GetHeadersAsync().GetAwaiter().GetResult();
    var links = parser.GetLinksAsync().GetAwaiter().GetResult();
    var tokens = parser.GetTokensAsync(inLinkTexts).GetAwaiter().GetResult();

    var urlFile = new UrlFile
    {
        // DateTime.ToBinary() packed into an unsigned storage column.
        PublishDate = (ulong)parser.GetPublishDateAsync().GetAwaiter().GetResult().ToBinary(),
        Charset = fetchedFile.Charset,
        Content = content,
        FileHash = fetchedFile.FileHash,
        FilePath = fetchedFile.FilePath,
        MimeType = fetchedFile.MimeType,
        TextContent = textContent,
        Title = title,
        Url = fetchedFile.Url,
        HeaderTotalLength = (uint)headers.Sum(o => o.Text.Length),
        // Headers weighted inversely by level: level 1 (h1) contributes 5,
        // level 5 contributes 1.
        // NOTE(review): assumes Level is in 1..6; a level > 6 would make the
        // summand negative before the uint cast — confirm parser contract.
        HeaderCount = (uint)headers.Sum(o => (6 - o.Level)),
        InLinkCount = (uint)inLinkTexts.Count,
        InLinkTotalLength = (uint)inLinkTexts.Sum(o => o.Length),
        // NOTE(review): 0.1 looks like an initial PageRank seed refined by a
        // later ranking pass — confirm against the ranking component.
        PageRank = 0.1,
    };

    return (urlFile, links, tokens);
}
/// <summary>
/// Looks for an already-stored file that duplicates <paramref name="content"/>
/// and lives on the same site (same host name, or host names resolving to the
/// same addresses — i.e. a mirror of the same machine).
/// </summary>
/// <param name="fetchedFile">The freshly fetched file (its hash selects candidates).</param>
/// <param name="content">The fetched file's content, compared verbatim.</param>
/// <returns>
/// (url, content) of the duplicate when one is found; (null, null) otherwise.
/// </returns>
public (string Url, string Content) JudgeContent(FetchedFile fetchedFile, string content)
{
    // Candidate duplicates: stored files with the same content hash.
    var sameFiles = mConfig.UrlFileStore.GetUrlFilesByHash(fetchedFile.FileHash);
    var host = UrlHelper.GetHost(fetchedFile.Url);

    // Our own host's addresses, resolved lazily and at most once for the
    // whole loop (the original resolved them per candidate).
    IPAddress[] hostAddresses = null;

    foreach (var file in sameFiles)
    {
        if (file.Url == fetchedFile.Url)
        {
            continue;
        }

        // Cheap check first: hashes can collide, so the stored content must
        // really match before any DNS work is worth doing.
        if (content != file.Content)
        {
            continue;
        }

        var currentHost = UrlHelper.GetHost(file.Url);
        bool isSameSite = currentHost == host;

        if (!isSameSite)
        {
            // Different host names may still name the same machine; compare
            // the resolved address sets. DNS answer ordering is not stable,
            // so an order-sensitive SequenceEqual would be fragile — compare
            // as sets instead.
            try
            {
                hostAddresses = hostAddresses ?? Dns.GetHostAddresses(host);
                isSameSite = new HashSet<IPAddress>(hostAddresses)
                    .SetEquals(Dns.GetHostAddresses(currentHost));
            }
            catch (Exception)
            {
                // Best effort: an unresolvable host simply means
                // "not provably the same site" — never fail the crawl here.
            }
        }

        if (isSameSite)
        {
            mLogger.Log(nameof(SimpleSimilarContentManager), $"Find Same UrlFile for {fetchedFile.Url}: {file.Url}");
            return (file.Url, file.Content);
        }
    }

    return (null, null);
}
/// <summary>
/// Returns the url of an already-known file whose content duplicates
/// <paramref name="content"/>, or null when this content is new.
/// Consults two sources: the persisted store (via mSimilarContentJudger)
/// and the in-memory content→url map of this crawl session. New content is
/// registered in the map under the current url so later identical fetches
/// are detected.
/// </summary>
private string GetSameUrl(FetchedFile fetchedFile, string content)
{
    // Judge if there are other files that have similar content as this.
    // (The duplicate's content is not needed here — discard it.)
    var (sameUrl, _) = mSimilarContentJudger.JudgeContent(fetchedFile, content);

    lock (mContentUrlDictSyncLock)
    {
        // Single lookup instead of ContainsKey + indexer (double lookup).
        if (mContentUrlDict.TryGetValue(content, out var existingUrl))
        {
            return existingUrl;
        }

        if (sameUrl == null)
        {
            // First time we see this content in this session: remember it.
            mContentUrlDict.Add(content, fetchedFile.Url);
        }
    }

    return sameUrl;
}
// Crawls a single url on a background task: fetch, parse, de-duplicate,
// persist, and feed newly discovered urls back into the frontier.
// Despite the Async suffix this method returns void — it starts a tracked
// Task and returns immediately. mFetchSemaphore bounds the number of
// concurrent fetches: acquired here, released in the task's finally block.
protected void FetchUrlAsync(string url)
{
    // Throttle: block until a fetch slot is free.
    mFetchSemaphore.Wait();
    mLogger.Log(nameof(Crawler), "Begin Crawl: " + url, true);

    var t = Task.Run(() =>
    {
        FetchedFile fetchedFile = null;
        try
        {
            // Fetch Url (null means the fetcher decided to skip it).
            fetchedFile = FetchUrl(url);
            if (fetchedFile == null)
            {
                return;
            }

            // Get InLink Texts: anchor texts of links pointing at this url.
            var inLinkTexts = GetInLinkTexts(fetchedFile.Url);

            // Parse File into the UrlFile entity, its out-links and tokens.
            var (urlFile, linkList, tokens) = ParseFetchedFile(fetchedFile, inLinkTexts);

            if (GetSameUrl(fetchedFile, urlFile.Content) != null)
            {
                // Has Same UrlFile (duplicate content elsewhere), Skip.
                return;
            }

            // Save New UrlFile.
            // Get Old id and New id; oldUrlFileId != 0 means this url was
            // stored before and its derived data must be purged.
            ulong oldUrlFileId;
            (urlFile, oldUrlFileId) = mConfig.UrlFileStore.SaveUrlFileAndGetOldId(urlFile);

            // Clear old data belonging to the superseded version.
            if (oldUrlFileId != 0)
            {
                mConfig.PostingListStore.ClearPostingLists(oldUrlFileId);
                mConfig.InvertedIndexStore.ClearIndices(oldUrlFileId);
                mConfig.LinkStore.ClearLinks(oldUrlFileId);
            }

            // Content is now persisted; drop the in-memory dedup entry that
            // GetSameUrl may have registered for it.
            lock (mContentUrlDictSyncLock)
            {
                mContentUrlDict.Remove(urlFile.Content);
            }

            // Filter Links (e.g. unwanted schemes/hosts — see FilterLinks).
            linkList = FilterLinks(linkList);

            // Save links, re-keyed to the new UrlFileId.
            mConfig.LinkStore.SaveLinks(urlFile.UrlFileId, linkList.Select(o => new Link
            {
                Text = o.Text,
                Url = o.Url,
                UrlFileId = urlFile.UrlFileId,
            }));

            // Save Indices (posting lists / inverted index).
            SaveIndices(tokens, inLinkTexts, urlFile, oldUrlFileId);

            // Add newly-found urls to the frontier.
            var urls = linkList.Select(o => o.Url).Distinct();
            mUrlFrontier.PushUrls(urls);

            // Push Back This Url so it is re-crawled after its update interval.
            mUrlFrontier.PushBackUrl(url, urlFile.UpdateInterval);

            mLogger.Log(nameof(Crawler), "End Crawl: " + url);
        }
        catch (NotSupportedException e)
        {
            mLogger.LogException(nameof(Crawler), "Not supported file format: " + url, e, false);
            mErrorLogger.LogException(nameof(Crawler), "Not supported file format: " + url, e);
            // Retry (third argument flags the push-back as a retry).
            mUrlFrontier.PushBackUrl(url, 0, true);
        }
        catch (InvalidDataException e)
        {
            mLogger.LogException(nameof(Crawler), "Invalid data: " + url, e);
            mErrorLogger.LogException(nameof(Crawler), "Invalid data: " + url, e);
            // Retry
            mUrlFrontier.PushBackUrl(url, 0, true);
        }
        catch (UriFormatException e)
        {
            // Malformed url: no retry can help, drop it permanently.
            mLogger.LogException(nameof(Crawler), "Invalid Uri: " + url, e);
            mErrorLogger.LogException(nameof(Crawler), "Invalid Uri: " + url, e);
            mUrlFrontier.RemoveUrl(url);
        }
        catch (IOException e)
        {
            // Fetch-level I/O failure: drop the url permanently.
            mLogger.LogException(nameof(Crawler), "Failed to fetch: " + url, e, false);
            mErrorLogger.LogException(nameof(Crawler), "Failed to fetch: " + url, e);
            mUrlFrontier.RemoveUrl(url);
        }
        catch (Exception e) when (
            e is OperationCanceledException
            || e is TaskCanceledException
            )
        {
            // Cancellation (e.g. shutdown): requeue without the retry flag.
            // NOTE(review): TaskCanceledException derives from
            // OperationCanceledException, so the second test is redundant.
            mUrlFrontier.PushBackUrl(url, 0);
        }
        catch (Exception e)
        {
            // Anything unforeseen: log and retry.
            mLogger.LogException(nameof(Crawler), "Failed to crawl url: " + url, e);
            mErrorLogger.LogException(nameof(Crawler), "Failed to crawl url: " + url, e);
            // Retry
            mUrlFrontier.PushBackUrl(url, 0, true);
        }
        finally
        {
            mFetchSemaphore.Release();
            // The downloaded temp file is no longer needed once processed.
            if (fetchedFile != null && File.Exists(fetchedFile.FilePath))
            {
                File.Delete(fetchedFile.FilePath);
            }
        }
    }).ContinueWith(task =>
    {
        // Untrack the finished task.
        mTasks.TryRemove(task, out bool v);
    });

    mTasks.TryAdd(t, true);
}