// Parses a previously fetched file into an indexable UrlFile record plus its
// outgoing links and extracted tokens.
// NOTE(review): the declaration header (access modifier / return type) is cut off
// at the top of this chunk; the return statement implies a (UrlFile, links, tokens)
// tuple — confirm against the full file.
ParseFetchedFile(FetchedFile fetchedFile, IList <string> inLinkTexts)
{
    UniversalFileParser parser = new UniversalFileParser();
    parser.SetFile(fetchedFile.MimeType, fetchedFile.Url, fetchedFile.Charset, fetchedFile.FilePath);

    // NOTE(review): blocking on async methods via .GetAwaiter().GetResult() risks
    // thread-pool starvation / deadlock on a sync context; consider making this
    // method async and awaiting — TODO confirm callers can take an async signature.
    var content = parser.GetContentAsync().GetAwaiter().GetResult();
    var textContent = parser.GetTextContentAsync().GetAwaiter().GetResult();
    var title = parser.GetTitleAsync().GetAwaiter().GetResult();
    var headers = parser.GetHeadersAsync().GetAwaiter().GetResult();
    var links = parser.GetLinksAsync().GetAwaiter().GetResult();
    // In-link anchor texts are fed to the tokenizer alongside the page content
    var tokens = parser.GetTokensAsync(inLinkTexts).GetAwaiter().GetResult();

    var urlFile = new UrlFile
    {
        // DateTime.ToBinary() packed into an unsigned 64-bit field for storage
        PublishDate = (ulong)parser.GetPublishDateAsync().GetAwaiter().GetResult().ToBinary(),
        Charset = fetchedFile.Charset,
        Content = content,
        FileHash = fetchedFile.FileHash,
        FilePath = fetchedFile.FilePath,
        MimeType = fetchedFile.MimeType,
        TextContent = textContent,
        Title = title,
        Url = fetchedFile.Url,
        HeaderTotalLength = (uint)headers.Sum(o => o.Text.Length),
        // (6 - Level) presumably weights headings so higher-level ones (h1) count
        // more than lower-level ones (h6) — verify intended ranking semantics
        HeaderCount = (uint)headers.Sum(o => (6 - o.Level)),
        InLinkCount = (uint)inLinkTexts.Count,
        InLinkTotalLength = (uint)inLinkTexts.Sum(o => o.Length),
        // Initial PageRank seed before any iteration of the ranking algorithm
        PageRank = 0.1,
    };
    return (urlFile, links, tokens);
}
/// <summary>
/// Fetch the web content in the specific url
/// </summary>
/// <param name="url">Url in which to fetch the content</param>
/// <returns>
/// Local url to downloaded content, or <c>null</c> when the MIME type is
/// unknown or not supported by the parser (the downloaded file is deleted).
/// </returns>
/// <exception cref="NotSupportedException">Uri scheme or media type is not supported</exception>
/// <exception cref="IOException">HTTP status is neither OK nor Accepted</exception>
public async Task <FetchedFile> FetchAsync(string url)
{
    var uri = new Uri(url);
    if (uri.Scheme != "http" && uri.Scheme != "https")
    {
        throw new NotSupportedException("Not supported Uri Scheme: " + uri.Scheme);
    }

    // Target path to save downloaded web file
    var filePath = Path.Combine(mConfig.FetchDirectory, UrlHelper.UrlToFileName(url));

    // Bypass the proxy for urls matching the configured regex
    var client = mConfig.NotUseProxyUrlRegex.IsMatch(url) ? mClientWithoutProxy : mClientWithProxy;
    var response = await client.GetAsync(url);
    var statusCode = response.StatusCode;
    // contentType may legitimately be null when the server sends no Content-Type header
    var contentType = response.Content.Headers.ContentType;

    if (statusCode != HttpStatusCode.Accepted && statusCode != HttpStatusCode.OK)
    {
        mLogger.Log(nameof(SimpleFetcher), "Status: " + statusCode + " " + url);
        throw new IOException(statusCode.ToString());
    }

    if (mConfig.UsePhantomJS && (contentType == null || contentType.MediaType == "text/html"))
    {
        // If config is set to use PhantomJS and web content type is HTML,
        // use PhantomJS to fetch real web page content
        File.WriteAllText(filePath, FetchPageContentByPhantomJS(url, mConfig.PhantomJSDriverPath));
    }
    else
    {
        // Otherwise, directly save it if supported by parser
        if (contentType == null || UniversalFileParser.IsSupported(contentType.MediaType))
        {
            using (Stream contentStream = await response.Content.ReadAsStreamAsync(),
                          stream = File.Create(filePath))
            {
                await contentStream.CopyToAsync(stream);
            }
        }
        else
        {
            throw new NotSupportedException("Not supported media type: " + contentType?.MediaType);
        }
    }

    #region Detect Content MIME Type

    var detectedContentType = MimeHelper.GetContentType(filePath);
    if ((detectedContentType == null
         || detectedContentType == "application/octet-stream"
         || detectedContentType == "inode/x-empty")
        && contentType != null)
    {
        // Content sniffing was inconclusive; fall back to the Content-Type header
        detectedContentType = contentType.MediaType;
    }
    if (detectedContentType == null)
    {
        File.Delete(filePath);
        mLogger.Log(nameof(SimpleFetcher), "Unknown MIME type: " + url);
        return null;
    }
    if (!UniversalFileParser.IsSupported(detectedContentType))
    {
        File.Delete(filePath);
        mLogger.Log(nameof(SimpleFetcher), $"Deleted Not-Supported MIME type ({detectedContentType}): {url}");
        return null;
    }

    #endregion

    // FIX: contentType may be null here (the branches above explicitly allow it),
    // so the original unconditional contentType.CharSet dereference could throw
    // NullReferenceException for responses without a Content-Type header.
    var detectedCharset = DetectEncoding(filePath, contentType?.CharSet);

    return new FetchedFile
    {
        Url = url,
        FilePath = filePath,
        Charset = detectedCharset,
        MimeType = detectedContentType,
        FileHash = HashHelper.GetFileMd5(filePath),
    };
}