// Parses a previously fetched file into an indexable UrlFile record plus its
// outgoing links and extracted tokens.
// NOTE(review): the declaration header (access modifier / return type) is cut off
// at the top of this chunk; the return statement implies a (UrlFile, links, tokens)
// tuple — confirm against the full file.
ParseFetchedFile(FetchedFile fetchedFile, IList <string> inLinkTexts)
{
    UniversalFileParser parser = new UniversalFileParser();
    parser.SetFile(fetchedFile.MimeType, fetchedFile.Url, fetchedFile.Charset, fetchedFile.FilePath);

    // NOTE(review): blocking on async methods via .GetAwaiter().GetResult() risks
    // thread-pool starvation / deadlock on a sync context; consider making this
    // method async and awaiting — TODO confirm callers can take an async signature.
    var content = parser.GetContentAsync().GetAwaiter().GetResult();
    var textContent = parser.GetTextContentAsync().GetAwaiter().GetResult();
    var title = parser.GetTitleAsync().GetAwaiter().GetResult();
    var headers = parser.GetHeadersAsync().GetAwaiter().GetResult();
    var links = parser.GetLinksAsync().GetAwaiter().GetResult();
    // In-link anchor texts are fed to the tokenizer alongside the page content
    var tokens = parser.GetTokensAsync(inLinkTexts).GetAwaiter().GetResult();

    var urlFile = new UrlFile
    {
        // DateTime.ToBinary() packed into an unsigned 64-bit field for storage
        PublishDate = (ulong)parser.GetPublishDateAsync().GetAwaiter().GetResult().ToBinary(),
        Charset = fetchedFile.Charset,
        Content = content,
        FileHash = fetchedFile.FileHash,
        FilePath = fetchedFile.FilePath,
        MimeType = fetchedFile.MimeType,
        TextContent = textContent,
        Title = title,
        Url = fetchedFile.Url,
        HeaderTotalLength = (uint)headers.Sum(o => o.Text.Length),
        // (6 - Level) presumably weights headings so higher-level ones (h1) count
        // more than lower-level ones (h6) — verify intended ranking semantics
        HeaderCount = (uint)headers.Sum(o => (6 - o.Level)),
        InLinkCount = (uint)inLinkTexts.Count,
        InLinkTotalLength = (uint)inLinkTexts.Sum(o => o.Length),
        // Initial PageRank seed before any iteration of the ranking algorithm
        PageRank = 0.1,
    };
    return (urlFile, links, tokens);
}
/// <summary>
/// Fetch the web content in the specific url
/// </summary>
/// <param name="url">Url in which to fetch the content</param>
/// <returns>
/// Local url to downloaded content, or <c>null</c> when the MIME type is
/// unknown or not supported by the parser (the downloaded file is deleted).
/// </returns>
/// <exception cref="NotSupportedException">Uri scheme or media type is not supported</exception>
/// <exception cref="IOException">HTTP status is neither OK nor Accepted</exception>
public async Task <FetchedFile> FetchAsync(string url)
{
    var uri = new Uri(url);
    if (uri.Scheme != "http" && uri.Scheme != "https")
    {
        throw new NotSupportedException("Not supported Uri Scheme: " + uri.Scheme);
    }

    // Target path to save downloaded web file
    var filePath = Path.Combine(mConfig.FetchDirectory, UrlHelper.UrlToFileName(url));

    // Bypass the proxy for urls matching the configured regex
    var client = mConfig.NotUseProxyUrlRegex.IsMatch(url) ? mClientWithoutProxy : mClientWithProxy;
    var response = await client.GetAsync(url);
    var statusCode = response.StatusCode;
    // contentType may legitimately be null when the server sends no Content-Type header
    var contentType = response.Content.Headers.ContentType;

    if (statusCode != HttpStatusCode.Accepted && statusCode != HttpStatusCode.OK)
    {
        mLogger.Log(nameof(SimpleFetcher), "Status: " + statusCode + " " + url);
        throw new IOException(statusCode.ToString());
    }

    if (mConfig.UsePhantomJS && (contentType == null || contentType.MediaType == "text/html"))
    {
        // If config is set to use PhantomJS and web content type is HTML,
        // use PhantomJS to fetch real web page content
        File.WriteAllText(filePath, FetchPageContentByPhantomJS(url, mConfig.PhantomJSDriverPath));
    }
    else
    {
        // Otherwise, directly save it if supported by parser
        if (contentType == null || UniversalFileParser.IsSupported(contentType.MediaType))
        {
            using (Stream contentStream = await response.Content.ReadAsStreamAsync(),
                          stream = File.Create(filePath))
            {
                await contentStream.CopyToAsync(stream);
            }
        }
        else
        {
            throw new NotSupportedException("Not supported media type: " + contentType?.MediaType);
        }
    }

    #region Detect Content MIME Type

    var detectedContentType = MimeHelper.GetContentType(filePath);
    if ((detectedContentType == null
         || detectedContentType == "application/octet-stream"
         || detectedContentType == "inode/x-empty")
        && contentType != null)
    {
        // Content sniffing was inconclusive; fall back to the Content-Type header
        detectedContentType = contentType.MediaType;
    }
    if (detectedContentType == null)
    {
        File.Delete(filePath);
        mLogger.Log(nameof(SimpleFetcher), "Unknown MIME type: " + url);
        return null;
    }
    if (!UniversalFileParser.IsSupported(detectedContentType))
    {
        File.Delete(filePath);
        mLogger.Log(nameof(SimpleFetcher), $"Deleted Not-Supported MIME type ({detectedContentType}): {url}");
        return null;
    }

    #endregion

    // FIX: contentType may be null here (the branches above explicitly allow it),
    // so the original unconditional contentType.CharSet dereference could throw
    // NullReferenceException for responses without a Content-Type header.
    var detectedCharset = DetectEncoding(filePath, contentType?.CharSet);

    return new FetchedFile
    {
        Url = url,
        FilePath = filePath,
        Charset = detectedCharset,
        MimeType = detectedContentType,
        FileHash = HashHelper.GetFileMd5(filePath),
    };
}