Пример #1
0
        /// <summary>
        /// Runs a <c>UniversalFileParser</c> over a fetched file and packs the parsed
        /// results into a <c>UrlFile</c>, returned together with the parsed links and tokens.
        /// </summary>
        /// <param name="fetchedFile">Fetched file whose MIME type, url, charset and file path are handed to the parser</param>
        /// <param name="inLinkTexts">In-link texts; forwarded to the parser's token extraction and used for the in-link count/length fields</param>
        /// <returns>Tuple of the populated <c>UrlFile</c>, the parser's links and the parser's tokens</returns>
        ParseFetchedFile(FetchedFile fetchedFile, IList <string> inLinkTexts)
        {
            var fileParser = new UniversalFileParser();
            fileParser.SetFile(fetchedFile.MimeType, fetchedFile.Url, fetchedFile.Charset, fetchedFile.FilePath);

            // Every parser query is asynchronous; each one is waited on synchronously, one after another
            var rawContent    = fileParser.GetContentAsync().GetAwaiter().GetResult();
            var plainText     = fileParser.GetTextContentAsync().GetAwaiter().GetResult();
            var pageTitle     = fileParser.GetTitleAsync().GetAwaiter().GetResult();
            var pageHeaders   = fileParser.GetHeadersAsync().GetAwaiter().GetResult();
            var outLinks      = fileParser.GetLinksAsync().GetAwaiter().GetResult();
            var parsedTokens  = fileParser.GetTokensAsync(inLinkTexts).GetAwaiter().GetResult();

            var record = new UrlFile
            {
                // Publish date is stored as the binary form of the parsed date
                PublishDate = (ulong)fileParser.GetPublishDateAsync().GetAwaiter().GetResult().ToBinary(),

                // Values copied straight from the fetched file and the parser output
                Charset     = fetchedFile.Charset,
                Content     = rawContent,
                FileHash    = fetchedFile.FileHash,
                FilePath    = fetchedFile.FilePath,
                MimeType    = fetchedFile.MimeType,
                TextContent = plainText,
                Title       = pageTitle,
                Url         = fetchedFile.Url,

                // Header statistics: total text length, and a sum of (6 - Level) over all headers
                HeaderTotalLength = (uint)pageHeaders.Sum(header => header.Text.Length),
                HeaderCount       = (uint)pageHeaders.Sum(header => (6 - header.Level)),

                // In-link statistics: number of texts and their total length
                InLinkCount       = (uint)inLinkTexts.Count,
                InLinkTotalLength = (uint)inLinkTexts.Sum(text => text.Length),

                // Initial page rank value
                PageRank = 0.1,
            };

            return (record, outLinks, parsedTokens);
        }
Пример #2
0
        /// <summary>
        /// Fetch the web content at the specific url and save it into the fetch directory
        /// </summary>
        /// <param name="url">Url from which to fetch the content; only http and https are accepted</param>
        /// <returns>
        /// Description of the downloaded file (url, local file path, charset, MIME type, file hash),
        /// or <c>null</c> when the MIME type of the saved file is unknown or not supported by the
        /// parser (the saved file is deleted in that case)
        /// </returns>
        /// <exception cref="NotSupportedException">
        /// The url scheme is neither http nor https, or the response media type is not supported by the parser
        /// </exception>
        /// <exception cref="IOException">The response status code is neither OK nor Accepted</exception>
        public async Task <FetchedFile> FetchAsync(string url)
        {
            var uri = new Uri(url);

            if (uri.Scheme != "http" && uri.Scheme != "https")
            {
                throw new NotSupportedException("Not supported Uri Scheme: " + uri.Scheme);
            }

            // Target path to save downloaded web file
            var filePath = Path.Combine(mConfig.FetchDirectory, UrlHelper.UrlToFileName(url));

            var client = mConfig.NotUseProxyUrlRegex.IsMatch(url) ? mClientWithoutProxy : mClientWithProxy;

            // The response is disposed on every exit path (return or throw) so that its
            // content and connection resources are always released
            using (var response = await client.GetAsync(url))
            {
                var statusCode  = response.StatusCode;
                // Null when the response carries no Content-Type header
                var contentType = response.Content.Headers.ContentType;

                if (statusCode != HttpStatusCode.Accepted &&
                    statusCode != HttpStatusCode.OK)
                {
                    mLogger.Log(nameof(SimpleFetcher), "Status: " + statusCode + " " + url);
                    throw new IOException(statusCode.ToString());
                }

                if (mConfig.UsePhantomJS && (contentType == null || contentType.MediaType == "text/html"))
                {
                    // If config is set to use PhantomJS and web content type is HTML,
                    // use PhantomJS to fetch real web page content
                    File.WriteAllText(filePath,
                                      FetchPageContentByPhantomJS(url, mConfig.PhantomJSDriverPath));
                }
                else
                {
                    // Otherwise, directly save it if supported by parser
                    if (contentType != null && !UniversalFileParser.IsSupported(contentType.MediaType))
                    {
                        throw new NotSupportedException("Not supported media type: " + contentType.MediaType);
                    }

                    using (Stream contentStream = await response.Content.ReadAsStreamAsync(),
                           stream = File.Create(filePath))
                    {
                        await contentStream.CopyToAsync(stream);
                    }
                }

                #region Detect Content MIME Type
                // Prefer the type detected from the saved file; fall back to the declared
                // Content-Type only when detection yields nothing useful
                var detectedContentType = MimeHelper.GetContentType(filePath);
                if ((detectedContentType == null ||
                     detectedContentType == "application/octet-stream" ||
                     detectedContentType == "inode/x-empty") &&
                    contentType != null)
                {
                    detectedContentType = contentType.MediaType;
                }

                if (detectedContentType == null)
                {
                    File.Delete(filePath);
                    mLogger.Log(nameof(SimpleFetcher), "Unknown MIME type: " + url);
                    return(null);
                }

                if (!UniversalFileParser.IsSupported(detectedContentType))
                {
                    File.Delete(filePath);
                    mLogger.Log(nameof(SimpleFetcher), $"Deleted Not-Supported MIME type ({detectedContentType}): {url}");
                    return(null);
                }
                #endregion

                // contentType can still be null here (no Content-Type header), so the declared
                // charset is read null-safely; a null charset hint is already possible whenever
                // the header has no charset parameter
                var detectedCharset = DetectEncoding(filePath, contentType?.CharSet);

                return(new FetchedFile
                {
                    Url = url,
                    FilePath = filePath,
                    Charset = detectedCharset,
                    MimeType = detectedContentType,
                    FileHash = HashHelper.GetFileMd5(filePath),
                });
            }
        }