/// <summary>
/// Parses a downloaded APOD HTML page and fills an <see cref="ApodDoc"/> with its
/// source URL, timestamp, title, name, body text, keywords and image URL.
/// </summary>
/// <param name="filename">
/// Path of the HTML file to parse. Its file name (without directory) is appended to
/// <c>BaseUrl</c> to form the source URL and is passed to <c>getTimestampStringFrom</c>.
/// </param>
/// <param name="apodDoc">
/// Receives the new document. It is assigned before the fields are populated, so it
/// may be partially filled if parsing throws part-way through.
/// </param>
/// <exception cref="InvalidOperationException">
/// The page has no <c>&lt;title&gt;</c> element, or has more than one.
/// </exception>
private void processHtmlToApodDocObj(string filename, out ApodDoc apodDoc)
{
    HtmlDocument htmlDocument = new HtmlDocument();
    htmlDocument.LoadHtml(File.ReadAllText(filename));
    var root = htmlDocument.DocumentNode;

    apodDoc = new ApodDoc();
    string pageFileName = Path.GetFileName(filename);
    apodDoc.source_url = BaseUrl + pageFileName;
    apodDoc.created_on = getTimestampStringFrom(pageFileName);

    // SelectNodes yields null (not an empty collection) when nothing matches,
    // so report a missing title explicitly instead of failing on a null reference.
    var titleNodes = root.SelectNodes("//title");
    if (titleNodes == null)
    {
        throw new InvalidOperationException("No <title> element found in: " + filename);
    }
    apodDoc.title = titleNodes.Single().InnerText;

    // Prefer the first bold element as the name; otherwise use the part of the
    // title that follows "- " (the dash plus one separator character).
    var firstBoldNode = root.SelectNodes("//b")?.FirstOrDefault();
    if (firstBoldNode != null)
    {
        apodDoc.name = firstBoldNode.InnerText;
    }
    else
    {
        int dashIndex = apodDoc.title.IndexOf('-');
        int nameStart = dashIndex + 2;
        // Fall back to the whole title when there is no dash or nothing follows it,
        // rather than chopping the first character or throwing.
        apodDoc.name = dashIndex >= 0 && nameStart <= apodDoc.title.Length
            ? apodDoc.title.Substring(nameStart)
            : apodDoc.title;
    }

    // Use the <body> text when present, else the text of the whole document.
    var bodyNode = root.SelectSingleNode("/html[1]/body[1]");
    apodDoc.body = bodyNode != null ? bodyNode.InnerText : root.InnerText;

    var metaNodes = root.SelectNodes("//meta[@name]");
    var keywordsNode = metaNodes?.FirstOrDefault(x => x.GetAttributeValue("name", null) == "keywords");
    // A keywords meta tag without a content attribute is ignored instead of throwing.
    var keywordsContent = keywordsNode?.GetAttributeValue("content", null);
    if (keywordsContent != null)
    {
        apodDoc.keywords = keywordsContent;
    }

    // First choice: a link whose last child is an <img> (the link target is the image).
    var anchorNodes = root.SelectNodes("//a[@href]");
    var imgHrefNode = anchorNodes?.FirstOrDefault(x => x.LastChild != null && x.LastChild.Name == "img");
    if (imgHrefNode != null)
    {
        // Read href by name; it is not guaranteed to be the element's first attribute.
        apodDoc.image_url = imgHrefNode.GetAttributeValue("href", null);
    }
    else
    {
        // Second choice: the first <img> that has a src attribute.
        var imgSrcNodes = root.SelectNodes("//img[@src]");
        if (imgSrcNodes != null)
        {
            apodDoc.image_url = imgSrcNodes.First().GetAttributeValue("src", null);
        }
    }

    if (apodDoc.image_url != null)
    {
        // NOTE(review): assumes the extracted URL is relative to BaseUrl — confirm
        // that pages never use absolute image links.
        apodDoc.image_url = BaseUrl + apodDoc.image_url;
    }
    else
    {
        Console.WriteLine("Unable to find image for: " + apodDoc.source_url);
    }
}
/// <summary>
/// Converts one APOD HTML file into an <see cref="ApodDoc"/> by delegating to
/// <c>processHtmlToApodDocObj</c>.
/// </summary>
/// <param name="filename">Path of the HTML file to convert.</param>
/// <returns>The document produced by <c>processHtmlToApodDocObj</c>.</returns>
/// <remarks>
/// Any failure is reported on the console and then rethrown unchanged; no partial
/// document is returned.
/// </remarks>
private ApodDoc processApodHtmlFile(string filename)
{
    ApodDoc parsedDoc = new ApodDoc();

    try
    {
        processHtmlToApodDocObj(filename, out parsedDoc);
        return parsedDoc;
    }
    catch (Exception ex)
    {
        // Blank line first so the message starts on its own line after any progress output.
        Console.WriteLine();
        string pageName = Path.GetFileNameWithoutExtension(filename);
        Console.WriteLine("Error processing: " + pageName + " , msg: " + ex.Message);
        throw;
    }
}
/// <summary>
/// Posts a percolate query to the <c>nasa/_search</c> endpoint using the stored JSON
/// document named <paramref name="docName"/> and writes the response status and
/// content to the console.
/// </summary>
/// <param name="docName">
/// Name of the document, without extension; <c>docName + ".json"</c> is loaded from
/// <c>jsonDocsPath</c>.
/// </param>
/// <remarks>
/// The call blocks until the request and the response read complete. A non-success
/// status code raises the exception thrown by <c>EnsureSuccessStatusCode</c> before
/// anything is printed.
/// </remarks>
public void percolateQuery(string docName)
{
    UriBuilder uriBuilder = new UriBuilder(EsScheme, EsHost, EsPort);
    uriBuilder.Path = "nasa/_search";
    uriBuilder.Query = "pretty";
    var uri = uriBuilder.Uri;

    // NOTE(review): the document is spliced in via ToString(); presumably that
    // yields JSON text — confirm against ApodDoc.
    var queryStr = "{ \"query\" : { \"percolate\" : { \"field\" : \"query\", \"document\" : "
        + ApodDoc.fromJsonFile(Path.Combine(jsonDocsPath, docName + ".json")).ToString()
        + "} } }";

    // Dispose both the request content and the response so their buffers and the
    // underlying connection resources are released deterministically.
    using (var requestContent = new StringContent(queryStr, Encoding.UTF8, "application/json"))
    {
        var task = client.PostAsync(uri, requestContent);
        task.Wait();

        using (var response = task.Result)
        {
            response.EnsureSuccessStatusCode();

            var strTask = response.Content.ReadAsStringAsync();
            strTask.Wait();

            Console.WriteLine("Status: " + response.StatusCode + " , content: " + strTask.Result);
        }
    }
}
/// <summary>
/// Loads every <c>*.json</c> document under <c>jsonDocsPath</c> and passes them to
/// <c>performIndex</c> in batches.
/// </summary>
/// <param name="batchSize">
/// Number of documents per <c>performIndex</c> call. Must be greater than zero;
/// otherwise an error is printed and nothing is indexed or deleted.
/// </param>
/// <param name="recreateIndex">
/// When true, <c>deleteIndex(NasaIndexName)</c> is called before indexing starts.
/// </param>
/// <remarks>
/// Errors during indexing are reported on the console rather than thrown.
/// </remarks>
internal void index(int batchSize, bool recreateIndex = false)
{
    // Validate before touching the index: a zero batch size would otherwise fail with
    // a divide-by-zero only after the existing index had already been deleted.
    if (batchSize <= 0)
    {
        Console.WriteLine("Error indexing documents: batchSize must be greater than zero (was " + batchSize + ").");
        return;
    }

    try
    {
        if (!Directory.Exists(jsonDocsPath))
        {
            Directory.CreateDirectory(jsonDocsPath);
        }

        if (recreateIndex)
        {
            deleteIndex(NasaIndexName);
        }

        var filesToIndex = Directory.GetFiles(jsonDocsPath, "*.json");
        Console.WriteLine("Beginning indexing of " + filesToIndex.Length + " documents. batchSize: " + batchSize);

        var docs = new List<ApodDoc>();
        foreach (var jsonDoc in filesToIndex)
        {
            docs.Add(ApodDoc.fromJsonFile(jsonDoc));

            // The list is emptied after every flush, so a full list marks a batch boundary.
            if (docs.Count == batchSize)
            {
                performIndex(docs);
                docs.Clear();
                Console.Write(".");
            }
        }

        // Flush the remainder, but do not issue a call for an empty batch (no files,
        // or a file count that is an exact multiple of batchSize).
        if (docs.Count > 0)
        {
            performIndex(docs);
        }
    }
    catch (Exception e)
    {
        Console.WriteLine("Error indexing documents: " + e.Message);
    }

    Console.WriteLine("Finished indexing.");
}
/// <summary>
/// Downloads the image referenced by each <c>*.json</c> document in
/// <paramref name="apodJsonDocsPath"/> into <paramref name="apodImagesPath"/> by
/// running wget once per image.
/// </summary>
/// <param name="apodJsonDocsPath">Directory containing the JSON documents.</param>
/// <param name="apodImagesPath">Output directory; created if it does not exist.</param>
/// <param name="delayMillis">Pause, in milliseconds, after each download attempt.</param>
/// <remarks>
/// Documents without an <c>image_url</c> and images whose output file already exists
/// are skipped. Errors are reported on the console and end the run without throwing.
/// </remarks>
public void getImages(string apodJsonDocsPath, string apodImagesPath, int delayMillis = 1300)
{
    const string WgetPath = @"C:\Program Files\Git\mingw64\bin\wget.exe";

    Console.WriteLine("Downloading NASA's Astronomy Picture of the Day's to: " + apodImagesPath);
    try
    {
        int numSkipped = 0;
        if (!Directory.Exists(apodImagesPath))
        {
            Directory.CreateDirectory(apodImagesPath);
        }

        foreach (string docFilePath in Directory.GetFiles(apodJsonDocsPath, "*.json"))
        {
            ApodDoc apodObj = ApodDoc.fromJsonFile(docFilePath);
            if (string.IsNullOrWhiteSpace(apodObj.image_url))
            {
                Console.WriteLine("image_url is empty for: " + apodObj.source_url + " , skipping.");
                continue;
            }

            // Output name is "<json file name>_<last segment of the image URL>".
            string imageFileName = apodObj.image_url.Substring(apodObj.image_url.LastIndexOf('/') + 1);
            var imgOutputPath = Path.Combine(apodImagesPath, Path.GetFileNameWithoutExtension(docFilePath) + "_" + imageFileName);

            if (imgOutputPath.LastIndexOf('.') < 0)
            {
                Console.WriteLine("ImgOutputPath: " + imgOutputPath + " , image_url: " + apodObj.image_url);
                // Only break into a debugger that is already attached; an unconditional
                // break can interrupt or terminate an unattended run.
                if (System.Diagnostics.Debugger.IsAttached)
                {
                    System.Diagnostics.Debugger.Break();
                }
            }

            if (File.Exists(imgOutputPath))
            {
                // Print the notice once, then one dot per 50 skipped files.
                if (numSkipped == 0)
                {
                    Console.WriteLine("Skipping already processed files.");
                }
                else if (numSkipped % 50 == 0)
                {
                    Console.Write(".");
                }
                ++numSkipped;
                continue;
            }

            if (numSkipped > 0)
            {
                numSkipped = 0;
                Console.WriteLine();
                Console.WriteLine("Processing missing pages...");
            }

            // Quote both values: the URL comes from scraped pages and the path may
            // contain spaces, so unquoted text could split into extra wget arguments.
            var processStartInfo = new ProcessStartInfo(
                WgetPath,
                quoteWgetArgument(apodObj.image_url) + " -O " + quoteWgetArgument(imgOutputPath));
            processStartInfo.CreateNoWindow = true;
            processStartInfo.UseShellExecute = false;

            using (var proc = new Process())
            {
                proc.StartInfo = processStartInfo;
                proc.Start();
                proc.WaitForExit();

                if (proc.ExitCode != 0)
                {
                    Console.WriteLine("wget exited with code " + proc.ExitCode + " for: " + apodObj.image_url);
                    // A failed run may leave an empty or partial output file behind;
                    // remove it so the next run retries instead of skipping it.
                    if (File.Exists(imgOutputPath))
                    {
                        File.Delete(imgOutputPath);
                    }
                }
            }

            Thread.Sleep(delayMillis);
        }
    }
    catch (Exception e)
    {
        Console.WriteLine("Error getting images: " + e.Message);
        return;
    }
}

// Wraps a value in double quotes for the wget command line, escaping embedded quotes.
// Minimal quoting: does not handle a backslash directly before an embedded quote.
private static string quoteWgetArgument(string value)
{
    return "\"" + value.Replace("\"", "\\\"") + "\"";
}