/// <summary>
/// Crawl and parse data from a URL.
/// </summary>
/// <param name="url">Source URL.</param>
/// <returns>Parse result.</returns>
public ParseResult ParseFromUrl(string url)
{
    if (String.IsNullOrEmpty(url)) throw new ArgumentNullException(nameof(url));

    ParseResult result = new ParseResult();
    result.Xml = new ParseResult.XmlParseResult();

    HttpCrawler httpCrawler = new HttpCrawler(url);
    CrawlResult crawlResult = httpCrawler.Get();
    if (!crawlResult.Success)
    {
        // Crawl failed: stamp the end time and return the unsuccessful result.
        result.Time.End = DateTime.UtcNow;
        return result;
    }

    // Decode the crawled bytes as UTF-8 and hand off to the shared parser.
    string content = Encoding.UTF8.GetString(crawlResult.Data);
    return ProcessSourceContent(content);
}
/// <summary>
/// Crawl FAQ items from MaxFaqId down to MinFaqId, caching each response body
/// on disk under CacheFolder as "{faqId}.json". Items already cached are
/// skipped; non-OK responses are recorded via AppendErrorFaq.
/// </summary>
/// <returns>The number of items successfully downloaded and cached in this run.</returns>
public async Task<int> CrawlItems()
{
    int processedCount = 0, alreadyExistCount = 0, emptyCount = 0, errorCount = 0;

    if (this.crawler == null)
    {
        this.crawler = new HttpCrawler();
    }

    if (!Directory.Exists(CacheFolder))
    {
        Directory.CreateDirectory(CacheFolder);
    }

    for (var faqId = MaxFaqId; faqId >= MinFaqId; faqId--)
    {
        var path = Path.Combine(CacheFolder, $"{faqId}.json");
        if (File.Exists(path))
        {
            alreadyExistCount++;
            continue;
        }

        var url = string.Format(UrlPattern, faqId);
        var requestItem = new RequestItem(url);
        var responseItem = await this.crawler.CrawlAsync(requestItem);
        if (responseItem.StatusCode == System.Net.HttpStatusCode.OK)
        {
            var jsonFaq = JToken.Parse(responseItem.Body);
            var respondFaqId = (string)jsonFaq["faqId"];
            if (respondFaqId != null)
            {
                // Real FAQ item: cache the raw response body.
                File.WriteAllText(path, responseItem.Body);
                processedCount++;
            }
            else
            {
                // 200 response but no "faqId" field: treat as empty/deleted.
                emptyCount++;
            }
        }
        else
        {
            AppendErrorFaq(faqId, responseItem);
            errorCount++;
        }

        if (faqId % 100 == 0)
        {
            Trace.TraceInformation($"Processed: {processedCount}, exists: {alreadyExistCount}, empty: {emptyCount}, error: {errorCount}");
        }
    }

    // BUG FIX: the original declared a 'successCount' local that was never
    // incremented and returned it (always 0). Return the number of items
    // actually downloaded and cached instead.
    return processedCount;
}
/// <summary>
/// Crawl the specified URL using HTTP.
/// </summary>
/// <param name="url">URL.</param>
/// <returns>Result.</returns>
public CrawlResult CrawlWebpage(string url)
{
    if (String.IsNullOrEmpty(url)) throw new ArgumentNullException(nameof(url));

    // Delegate the actual retrieval to the HTTP crawler.
    return new HttpCrawler(url).Get();
}
/// <summary>
/// Download a single image using the crawler appropriate for its source URL:
/// a Cosmos crawler for cosmos share paths, an HTTP crawler for http/https URLs.
/// </summary>
/// <param name="myImage">Image whose SourceUrl and DownloadImageName drive the download.</param>
/// <returns>True when the crawl succeeds; false for an unrecognized URI or a failed crawl.</returns>
private bool DownloadImage(Image myImage)
{
    BaseCrawler crawler = null;
    string sourceUrl = myImage.SourceUrl.Trim();

    if (sourceUrl.StartsWith(@"https://cosmos", StringComparison.InvariantCultureIgnoreCase)) // cosmos path
    {
        crawler = new CosmosCrawler(FeedUriTypes.ShareFolder, sourceUrl, myImage.DownloadImageName, DateTime.MinValue);
    }
    else if (sourceUrl.StartsWith("http://", StringComparison.InvariantCultureIgnoreCase) ||
             sourceUrl.StartsWith("https://", StringComparison.InvariantCultureIgnoreCase))
    {
        // FIX: scheme comparison is now case-insensitive, consistent with the
        // cosmos branch above. Previously "HTTP://..." URLs fell through to
        // the "wrong image URI" branch and were rejected.
        ProxyType proxyType;
        if (!Enum.TryParse<ProxyType>(proxy, true, out proxyType))
        {
            proxyType = ProxyType.NULL;
        }

        crawler = new HttpCrawler(FeedUriTypes.Http, sourceUrl, myImage.DownloadImageName, DateTime.MinValue, null, proxyType);
    }
    else // wrong image URI
    {
        ImageLogger.LogMessage(this.log, EventType.Warning, "Cannot identify this image's URI: {0}", sourceUrl);
        return false;
    }

    // crawler is guaranteed non-null here (the unrecognized-URI branch returned),
    // so the original redundant null check has been dropped.
    if (crawler.Crawl() != BuildResults.Crawler_Succeed)
    {
        ImageLogger.LogMessage(this.log, EventType.Warning, "Exception when download image {0}", sourceUrl);
        return false;
    }

    return true;
}
// Console demo: prompt for a URL and HTTP settings, crawl it, then enumerate
// and parse the result.
//
// NOTE(review): this listing appears to have been run through a credential
// scrubber — the span between "Username:" and "console" was replaced with
// "******" and is NOT valid C#. The original presumably prompted for a
// username and password, invoked the crawler, and captured a CrawlResult 'cr'
// before the surviving EnumerateCrawlResult/ParseCrawlResult calls. Restore
// this region from source control before building.
static void HttpCrawler()
{
    string url = Common.InputString("URL:", "http://127.0.0.1/", true);
    if (String.IsNullOrEmpty(url)) { return; }

    HttpCrawler hc = new HttpCrawler(url);
    hc.IgnoreCertificateErrors = true;
    hc.Method = (HttpMethod)(Enum.Parse(typeof(HttpMethod), Common.InputString("Method:", "GET", false)));
    // --- scrubbed region begins (invalid C#, preserved verbatim) ---
    hc.Username = Common.InputString("Username:"******"Password:"******"console"))
    {
        EnumerateCrawlResult(cr);
    }
    // --- scrubbed region ends ---
    ParseCrawlResult(cr);
}
// Console demo (second copy): prompt for a URL and HTTP settings, crawl it,
// then dump the CrawlResult fields to the console.
//
// NOTE(review): credential-scrubber damage ("******") — the span between
// "Username:" and "Success" is not valid C#. This looks like the HttpCrawler
// prompt code fused with an EnumerateCrawlResult-style dump of 'cr'; the
// scrub likely swallowed the password prompt, the Get() call, and a method
// boundary. Restore from source control before building.
static void HttpCrawler()
{
    string url = Common.InputString("URL:", "http://127.0.0.1/", true);
    if (String.IsNullOrEmpty(url)) { return; }

    HttpCrawler hc = new HttpCrawler(url);
    hc.IgnoreCertificateErrors = true;
    hc.Method = (HttpMethod)(Enum.Parse(typeof(HttpMethod), Common.InputString("Method:", "GET", false)));
    // --- scrubbed region begins (invalid C#, preserved verbatim) ---
    hc.Username = Common.InputString("Username:"******"Password:"******"Success : " + cr.Success);
    // --- scrubbed region ends ---
    Console.WriteLine("Start time : " + cr.Time.Start.ToString());
    Console.WriteLine("End time : " + cr.Time.End.ToString());
    Console.WriteLine("Total ms : " + cr.Time.TotalMs.ToString() + "ms");
    Console.WriteLine("Content length : " + cr.ContentLength + " bytes");
    if (cr.Http != null)
    {
        Console.WriteLine("Status code : " + cr.Http.StatusCode);
        if (cr.Http.Headers != null && cr.Http.Headers.Count > 0)
        {
            Console.WriteLine("Headers : ");
            foreach (KeyValuePair <string, string> curr in cr.Http.Headers)
            {
                Console.WriteLine(" " + curr.Key + ": " + curr.Value);
            }
        }
    }
    Console.WriteLine("Data :" + Environment.NewLine + Encoding.UTF8.GetString(cr.Data));
}
/// <summary>
/// HTTP handler: parse a document without indexing it. The data source is
/// resolved in priority order: 'url' querystring param, then 'filename', then
/// (for type 'sql') a SQL query in the request body, then raw request-body
/// data. The 'type' querystring param [json/xml/html/sql/text] selects the
/// parser. Responds 200 with the serialized ParseResult, 400 for bad input,
/// 500 for crawl/parse failures.
/// </summary>
/// <param name="md">Request metadata (HTTP context plus parsed parameters).</param>
private static async Task PostParse(RequestMetadata md)
{
    // Log prefix identifying the remote caller and this endpoint.
    string header = "[Komodo.Server] " + md.Http.Request.Source.IpAddress + ":" + md.Http.Request.Source.Port + " PostParse ";

    // 'type' is mandatory; without it no parser can be selected.
    if (String.IsNullOrEmpty(md.Params.Type))
    {
        _Logging.Warn(header + "no document type supplied");
        md.Http.Response.StatusCode = 400;
        md.Http.Response.ContentType = "application/json";
        await md.Http.Response.Send(new ErrorResponse(400, "Supply 'type' [json/xml/html/sql/text] in querystring.", null, null).ToJson(true));
        return;
    }

    byte[] data = null;
    CrawlResult crawlResult = null;
    ParseResult parseResult = null;
    HttpCrawler httpCrawler = null;
    FileCrawler fileCrawler = null;

    if (!String.IsNullOrEmpty(md.Params.Url))
    {
        #region Crawl-URL

        // Crawl the URL over HTTP, then parse with the type-specific parser.
        // NOTE(review): the four cases are structurally identical except for
        // the parser class; the ErrorResponse payload carries code 400 while
        // the HTTP status sent is 500 — looks intentional, but confirm.
        switch (md.Params.Type.ToLower())
        {
            case "html":
                httpCrawler = new HttpCrawler(md.Params.Url);
                crawlResult = httpCrawler.Get();
                if (!crawlResult.Success)
                {
                    _Logging.Warn(header + "failed to crawl URL " + md.Params.Url);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to crawl supplied URL.", null, crawlResult).ToJson(true));
                    return;
                }
                data = crawlResult.Data;
                HtmlParser htmlParser = new HtmlParser();
                parseResult = htmlParser.ParseBytes(data);
                if (!parseResult.Success)
                {
                    _Logging.Warn(header + "failed to parse data from URL " + md.Params.Url);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to parse data from supplied URL.", null, parseResult).ToJson(true));
                    return;
                }
                break;

            case "json":
                httpCrawler = new HttpCrawler(md.Params.Url);
                crawlResult = httpCrawler.Get();
                if (!crawlResult.Success)
                {
                    _Logging.Warn(header + "failed to crawl URL " + md.Params.Url);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to crawl supplied URL.", null, crawlResult).ToJson(true));
                    return;
                }
                data = crawlResult.Data;
                JsonParser jsonParser = new JsonParser();
                parseResult = jsonParser.ParseBytes(data);
                if (!parseResult.Success)
                {
                    _Logging.Warn(header + "failed to parse data from URL " + md.Params.Url);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to parse data from supplied URL.", null, parseResult).ToJson(true));
                    return;
                }
                break;

            case "text":
                httpCrawler = new HttpCrawler(md.Params.Url);
                crawlResult = httpCrawler.Get();
                if (!crawlResult.Success)
                {
                    _Logging.Warn(header + "failed to crawl URL " + md.Params.Url);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to crawl supplied URL.", null, crawlResult).ToJson(true));
                    return;
                }
                data = crawlResult.Data;
                TextParser textParser = new TextParser();
                parseResult = textParser.ParseBytes(data);
                if (!parseResult.Success)
                {
                    _Logging.Warn(header + "failed to parse data from URL " + md.Params.Url);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to parse data from supplied URL.", null, parseResult).ToJson(true));
                    return;
                }
                break;

            case "xml":
                httpCrawler = new HttpCrawler(md.Params.Url);
                crawlResult = httpCrawler.Get();
                if (!crawlResult.Success)
                {
                    _Logging.Warn(header + "failed to crawl URL " + md.Params.Url);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to crawl supplied URL.", null, crawlResult).ToJson(true));
                    return;
                }
                data = crawlResult.Data;
                XmlParser xmlParser = new XmlParser();
                parseResult = xmlParser.ParseBytes(data);
                if (!parseResult.Success)
                {
                    _Logging.Warn(header + "failed to parse data from URL " + md.Params.Url);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to parse data from supplied URL.", null, parseResult).ToJson(true));
                    return;
                }
                break;

            default:
                _Logging.Warn(header + "invalid document type for processing via URL " + md.Params.Url);
                md.Http.Response.StatusCode = 400;
                md.Http.Response.ContentType = "application/json";
                await md.Http.Response.Send(new ErrorResponse(400, "Invalid document type.", null, null).ToJson(true));
                return;
        }

        // Success: return the parse result as JSON.
        md.Http.Response.StatusCode = 200;
        md.Http.Response.ContentType = "application/json";
        await md.Http.Response.Send(Common.SerializeJson(parseResult, md.Params.Pretty));
        return;

        #endregion
    }
    else if (!String.IsNullOrEmpty(md.Params.Filename))
    {
        #region Filename

        // Same pattern as Crawl-URL, but reading from the local filesystem.
        switch (md.Params.Type.ToLower())
        {
            case "html":
                fileCrawler = new FileCrawler(md.Params.Filename);
                crawlResult = fileCrawler.Get();
                if (!crawlResult.Success)
                {
                    _Logging.Warn(header + "failed to crawl filename " + md.Params.Filename);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to crawl supplied filename.", null, crawlResult).ToJson(true));
                    return;
                }
                data = crawlResult.Data;
                HtmlParser htmlParser = new HtmlParser();
                parseResult = htmlParser.ParseBytes(data);
                if (!parseResult.Success)
                {
                    _Logging.Warn(header + "failed to parse data from file " + md.Params.Filename);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to parse data from supplied filename.", null, parseResult).ToJson(true));
                    return;
                }
                break;

            case "json":
                fileCrawler = new FileCrawler(md.Params.Filename);
                crawlResult = fileCrawler.Get();
                if (!crawlResult.Success)
                {
                    _Logging.Warn(header + "failed to crawl filename " + md.Params.Filename);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to crawl supplied filename.", null, crawlResult).ToJson(true));
                    return;
                }
                data = crawlResult.Data;
                JsonParser jsonParser = new JsonParser();
                parseResult = jsonParser.ParseBytes(data);
                if (!parseResult.Success)
                {
                    _Logging.Warn(header + "failed to parse data from file " + md.Params.Filename);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to parse data from supplied filename.", null, parseResult).ToJson(true));
                    return;
                }
                break;

            case "text":
                fileCrawler = new FileCrawler(md.Params.Filename);
                crawlResult = fileCrawler.Get();
                if (!crawlResult.Success)
                {
                    _Logging.Warn(header + "failed to crawl filename " + md.Params.Filename);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to crawl supplied filename.", null, crawlResult).ToJson(true));
                    return;
                }
                data = crawlResult.Data;
                TextParser textParser = new TextParser();
                parseResult = textParser.ParseBytes(data);
                if (!parseResult.Success)
                {
                    _Logging.Warn(header + "failed to parse data from file " + md.Params.Filename);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to parse data from supplied filename.", null, parseResult).ToJson(true));
                    return;
                }
                break;

            case "xml":
                fileCrawler = new FileCrawler(md.Params.Filename);
                crawlResult = fileCrawler.Get();
                if (!crawlResult.Success)
                {
                    _Logging.Warn(header + "failed to crawl filename " + md.Params.Filename);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to crawl supplied filename.", null, crawlResult).ToJson(true));
                    return;
                }
                data = crawlResult.Data;
                XmlParser xmlParser = new XmlParser();
                parseResult = xmlParser.ParseBytes(data);
                if (!parseResult.Success)
                {
                    _Logging.Warn(header + "failed to parse data from file " + md.Params.Filename);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to parse data from supplied filename.", null, parseResult).ToJson(true));
                    return;
                }
                break;

            default:
                _Logging.Warn(header + "invalid document type for processing via filename " + md.Params.Filename);
                md.Http.Response.StatusCode = 400;
                md.Http.Response.ContentType = "application/json";
                await md.Http.Response.Send(new ErrorResponse(400, "Invalid document type.", null, null).ToJson(true));
                return;
        }

        // Success: return the parse result as JSON.
        md.Http.Response.StatusCode = 200;
        md.Http.Response.ContentType = "application/json";
        await md.Http.Response.Send(Common.SerializeJson(parseResult, md.Params.Pretty));
        return;

        #endregion
    }
    else if (md.Params.Type.ToLower().Equals("sql"))
    {
        #region Query

        // SQL mode: the request body IS the query; DB connection settings come
        // from the querystring.
        if (md.Http.Request.Data == null || md.Http.Request.ContentLength < 1)
        {
            _Logging.Warn(header + "no query found in payload");
            md.Http.Response.StatusCode = 400;
            md.Http.Response.ContentType = "application/json";
            await md.Http.Response.Send(new ErrorResponse(400, "No SQL query in request payload.", null, null).ToJson(true));
            return;
        }

        DbSettings dbSettings = new DbSettings(md.Params.DbType, md.Params.DbServer, md.Params.DbPort, md.Params.DbUser, md.Params.DbPass, md.Params.DbInstance, md.Params.DbName);
        SqlCrawler sqlCrawler = new SqlCrawler(dbSettings, Encoding.UTF8.GetString(Common.StreamToBytes(md.Http.Request.Data)));
        crawlResult = sqlCrawler.Get();
        if (!crawlResult.Success)
        {
            _Logging.Warn(header + "failed to crawl database " + md.Params.DbName);
            md.Http.Response.StatusCode = 500;
            md.Http.Response.ContentType = "application/json";
            await md.Http.Response.Send(new ErrorResponse(400, "Failed to crawl specified database.", null, crawlResult).ToJson(true));
            return;
        }

        // SQL results arrive as a DataTable rather than raw bytes.
        SqlParser sqlParser = new SqlParser();
        parseResult = sqlParser.Parse(crawlResult.DataTable);
        if (!parseResult.Success)
        {
            _Logging.Warn(header + "failed to parse data from database " + md.Params.DbName);
            md.Http.Response.StatusCode = 500;
            md.Http.Response.ContentType = "application/json";
            await md.Http.Response.Send(new ErrorResponse(400, "Failed to parse data from specified database.", null, parseResult).ToJson(true));
            return;
        }

        md.Http.Response.StatusCode = 200;
        md.Http.Response.ContentType = "application/json";
        await md.Http.Response.Send(Common.SerializeJson(parseResult, md.Params.Pretty));
        return;

        #endregion
    }
    else if (md.Http.Request.Data != null && md.Http.Request.ContentLength > 0)
    {
        #region Supplied-Data

        // No URL/filename: parse the raw request body directly (no crawl step).
        data = Common.StreamToBytes(md.Http.Request.Data);

        switch (md.Params.Type.ToLower())
        {
            case "html":
                HtmlParser htmlParser = new HtmlParser();
                parseResult = htmlParser.ParseBytes(data);
                if (!parseResult.Success)
                {
                    _Logging.Warn(header + "failed to parse data from supplied data");
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to parse data from supplied data.", null, parseResult).ToJson(true));
                    return;
                }
                break;

            case "json":
                JsonParser jsonParser = new JsonParser();
                parseResult = jsonParser.ParseBytes(data);
                if (!parseResult.Success)
                {
                    _Logging.Warn(header + "failed to parse data from supplied data");
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to parse data from supplied data.", null, parseResult).ToJson(true));
                    return;
                }
                break;

            case "text":
                TextParser textParser = new TextParser();
                parseResult = textParser.ParseBytes(data);
                if (!parseResult.Success)
                {
                    _Logging.Warn(header + "failed to parse data from supplied data");
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to parse data from supplied data.", null, parseResult).ToJson(true));
                    return;
                }
                break;

            case "xml":
                XmlParser xmlParser = new XmlParser();
                parseResult = xmlParser.ParseBytes(data);
                if (!parseResult.Success)
                {
                    _Logging.Warn(header + "failed to parse data from supplied data");
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to parse data from supplied data.", null, parseResult).ToJson(true));
                    return;
                }
                break;

            default:
                _Logging.Warn(header + "invalid document type for processing via data");
                md.Http.Response.StatusCode = 400;
                md.Http.Response.ContentType = "application/json";
                await md.Http.Response.Send(new ErrorResponse(400, "Invalid document type supplied.", null, null).ToJson(true));
                return;
        }

        md.Http.Response.StatusCode = 200;
        md.Http.Response.ContentType = "application/json";
        await md.Http.Response.Send(Common.SerializeJson(parseResult, md.Params.Pretty));
        return;

        #endregion
    }
    else
    {
        #region Unknown

        // No usable source at all: reject the request.
        _Logging.Warn(header + "unable to derive data source from request");
        md.Http.Response.StatusCode = 400;
        md.Http.Response.ContentType = "application/json";
        await md.Http.Response.Send(new ErrorResponse(400, "Unable to derive data source from request.", null, null).ToJson(true));
        return;

        #endregion
    }
}
/// <summary>
/// HTTP handler: index a document into the index named by the first URL
/// element, optionally under a caller-supplied source GUID (second URL
/// element). The document is spooled to a temp file (from 'url', 'filename',
/// or the request body), hashed, and handed to the daemon either
/// synchronously or fire-and-forget depending on md.Params.Async.
/// The temp file is always deleted on exit.
/// </summary>
/// <param name="md">Request metadata (HTTP context plus parsed parameters).</param>
private static async Task PostIndexDocument(RequestMetadata md)
{
    #region Variables

    string header = "[Komodo.Server] " + md.Http.Request.Source.IpAddress + ":" + md.Http.Request.Source.Port + " PostIndexDocument ";
    string tempFile = _Settings.TempStorage.Disk.Directory + Guid.NewGuid().ToString();
    string indexName = md.Http.Request.Url.Elements[0];
    string sourceGuid = null;
    if (md.Http.Request.Url.Elements.Length == 2)
    {
        sourceGuid = md.Http.Request.Url.Elements[1];
    }

    #endregion

    #region Check-Index-Existence

    Index index = _Daemon.GetIndex(indexName);
    if (index == null)
    {
        _Logging.Warn(header + "index " + indexName + " does not exist");
        md.Http.Response.StatusCode = 404;
        md.Http.Response.ContentType = "application/json";
        await md.Http.Response.Send(new ErrorResponse(404, "Unknown index.", null, null).ToJson(true));
        return;
    }

    #endregion

    #region Check-Supplied-GUID

    // A caller-supplied GUID must not collide with an existing document.
    if (!String.IsNullOrEmpty(sourceGuid))
    {
        if (_Daemon.SourceDocumentExists(indexName, sourceGuid))
        {
            _Logging.Warn(header + "document " + indexName + "/" + sourceGuid + " already exists");
            md.Http.Response.StatusCode = 409;
            md.Http.Response.ContentType = "application/json";
            await md.Http.Response.Send(new ErrorResponse(409, "Requested GUID already exists.", null, null).ToJson(true));
            return;
        }
    }

    #endregion

    #region Retrieve-DocType-from-QS

    if (String.IsNullOrEmpty(md.Params.Type))
    {
        _Logging.Warn(header + "no 'type' value found in querystring");
        md.Http.Response.StatusCode = 400;
        md.Http.Response.ContentType = "application/json";
        await md.Http.Response.Send(new ErrorResponse(400, "Supply 'type' [json/xml/html/sql/text] in querystring.", null, null).ToJson(true));
        return;
    }

    DocType docType = DocType.Json;
    switch (md.Params.Type)
    {
        case "json": docType = DocType.Json; break;
        case "xml": docType = DocType.Xml; break;
        case "html": docType = DocType.Html; break;
        case "sql": docType = DocType.Sql; break;
        case "text": docType = DocType.Text; break;
        case "unknown": docType = DocType.Unknown; break;
        default:
            _Logging.Warn(header + "invalid 'type' value found in querystring: " + md.Params.Type);
            md.Http.Response.StatusCode = 400;
            md.Http.Response.ContentType = "application/json";
            await md.Http.Response.Send(new ErrorResponse(400, "Supply 'type' [json/xml/html/sql/text] in querystring.", null, null).ToJson(true));
            return;
    }

    #endregion

    try
    {
        #region Write-Temp-File

        long contentLength = 0;
        string md5 = null;
        CrawlResult crawlResult = null;

        if (!String.IsNullOrEmpty(md.Params.Url) || !String.IsNullOrEmpty(md.Params.Filename))
        {
            #region Crawl

            if (!String.IsNullOrEmpty(md.Params.Url))
            {
                HttpCrawler httpCrawler = new HttpCrawler(md.Params.Url);
                crawlResult = httpCrawler.Download(tempFile);
                if (!crawlResult.Success)
                {
                    _Logging.Warn(header + "failed to crawl URL " + md.Params.Url);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to crawl supplied URL.", null, crawlResult).ToJson(true));
                    return;
                }
                contentLength = crawlResult.ContentLength;
                md5 = Common.Md5File(tempFile);
            }
            else
            {
                FileCrawler fileCrawler = new FileCrawler(md.Params.Filename);
                crawlResult = fileCrawler.Download(tempFile);
                if (!crawlResult.Success)
                {
                    _Logging.Warn(header + "failed to crawl filename " + md.Params.Filename);
                    md.Http.Response.StatusCode = 500;
                    md.Http.Response.ContentType = "application/json";
                    await md.Http.Response.Send(new ErrorResponse(400, "Failed to crawl supplied filename.", null, crawlResult).ToJson(true));
                    return;
                }
                contentLength = crawlResult.ContentLength;
                // BUG FIX: this branch previously called Common.Md5(tempFile),
                // unlike the other two branches which call Common.Md5File.
                // That appears to hash the path string rather than the file
                // contents; unified on Md5File for a consistent content hash.
                md5 = Common.Md5File(tempFile);
            }

            #endregion
        }
        else
        {
            // No URL/filename: spool the request body to the temp file.
            using (FileStream fs = new FileStream(tempFile, FileMode.Create, FileAccess.ReadWrite))
            {
                await md.Http.Request.Data.CopyToAsync(fs);
            }
            contentLength = md.Http.Request.ContentLength;
            md5 = Common.Md5File(tempFile);
        }

        #endregion

        #region Build-Source-Document

        string sourceUrl = null;
        if (!String.IsNullOrEmpty(md.Params.Url))
        {
            sourceUrl = md.Params.Url;
        }
        else if (!String.IsNullOrEmpty(md.Params.Filename))
        {
            sourceUrl = md.Params.Filename;
        }

        List<string> tags = null;
        if (!String.IsNullOrEmpty(md.Params.Tags))
        {
            tags = Common.CsvToStringList(md.Params.Tags);
        }

        SourceDocument src = new SourceDocument(
            sourceGuid,
            md.User.GUID,
            index.GUID,
            md.Params.Name,
            md.Params.Title,
            tags,
            docType,
            sourceUrl,
            md.Http.Request.ContentType,
            contentLength,
            md5);

        #endregion

        if (!md.Params.Async)
        {
            #region Sync

            // Index inline and report the daemon's actual result.
            IndexResult result = await _Daemon.AddDocument(
                indexName,
                src,
                Common.ReadBinaryFile(tempFile),
                new ParseOptions(),
                !md.Params.Bypass);

            if (!result.Success)
            {
                _Logging.Warn(header + "unable to store document in index " + indexName);
                md.Http.Response.StatusCode = 500;
                md.Http.Response.ContentType = "application/json";
                await md.Http.Response.Send(new ErrorResponse(500, "Unable to store document in index '" + indexName + "'.", null, result).ToJson(true));
                return;
            }

            md.Http.Response.StatusCode = 200;
            md.Http.Response.ContentType = "application/json";
            await md.Http.Response.Send(Common.SerializeJson(result, md.Params.Pretty));
            return;

            #endregion
        }
        else
        {
            #region Async

            // Fire-and-forget: return an optimistic result immediately.
            // The file bytes are read synchronously here, BEFORE the finally
            // block deletes the temp file, so the background task is safe.
            IndexResult result = new IndexResult();
            result.Success = true;
            result.GUID = src.GUID;
            result.Type = docType;
            result.ParseResult = null;
            result.Time = null;

            Task unawaited = _Daemon.AddDocument(
                index.Name,
                src,
                Common.ReadBinaryFile(tempFile),
                new ParseOptions(),
                !md.Params.Bypass);

            md.Http.Response.StatusCode = 200;
            md.Http.Response.ContentType = "application/json";
            await md.Http.Response.Send(Common.SerializeJson(result, md.Params.Pretty));
            return;

            #endregion
        }
    }
    finally
    {
        // Always clean up the spooled temp file.
        if (File.Exists(tempFile))
        {
            File.Delete(tempFile);
        }
    }
}