/// <summary>
/// Parses a saved HTML page and fills a new <see cref="ApodDoc"/> with its
/// source URL, timestamp, title, name, body text, keywords and image URL.
/// </summary>
/// <param name="filename">Path of the HTML file to read from disk.</param>
/// <param name="apodDoc">Receives the populated document.</param>
/// <exception cref="InvalidOperationException">
/// The page does not contain exactly one &lt;title&gt; element.
/// </exception>
private void processHtmlToApodDocObj(string filename, out ApodDoc apodDoc)
        {
            HtmlDocument htmlDocument = new HtmlDocument();

            htmlDocument.LoadHtml(File.ReadAllText(filename));
            var root = htmlDocument.DocumentNode;

            apodDoc            = new ApodDoc();
            apodDoc.source_url = BaseUrl + Path.GetFileName(filename);
            apodDoc.created_on = getTimestampStringFrom(Path.GetFileName(filename));

            // SelectNodes returns null (not an empty collection) when nothing matches,
            // so check explicitly to fail with a message that names the offending file.
            var titleNodes = root.SelectNodes("//title");
            if (titleNodes == null)
            {
                throw new InvalidOperationException("No <title> element found in: " + filename);
            }
            apodDoc.title = titleNodes.Single().InnerText;

            // Prefer the first bold element as the name; otherwise fall back to the title.
            var firstBoldNode = root.SelectSingleNode("//b");
            if (firstBoldNode != null)
            {
                apodDoc.name = firstBoldNode.InnerText;
            }
            else
            {
                // Presumably titles look like "<prefix> - <name>"; when there is no dash
                // keep the whole title rather than chopping off its first character.
                int dashIndex = apodDoc.title.IndexOf('-');
                apodDoc.name = dashIndex >= 0
                    ? apodDoc.title.Substring(dashIndex + 1).TrimStart()
                    : apodDoc.title;
            }

            // Fall back to the whole document text when there is no /html/body element.
            var bodyNode = root.SelectSingleNode("/html[1]/body[1]");
            apodDoc.body = bodyNode != null ? bodyNode.InnerText : root.InnerText;

            var metaNodes    = root.SelectNodes("//meta[@name]");
            var keywordsNode = metaNodes?.FirstOrDefault(x => x.GetAttributeValue("name", null) == "keywords");

            if (keywordsNode != null)
            {
                // A keywords meta tag without a content attribute yields null instead of throwing.
                apodDoc.keywords = keywordsNode.Attributes["content"]?.Value;
            }

            // First choice: a link whose last child is an <img>; its href is used as the image URL.
            string imageUrl    = null;
            var    anchorNodes = root.SelectNodes("//a[@href]");
            var    imgHrefNode = anchorNodes?.FirstOrDefault(x => x.LastChild != null && x.LastChild.Name == "img");

            if (imgHrefNode != null)
            {
                // Read href by name: it is not necessarily the first attribute on the tag.
                imageUrl = imgHrefNode.Attributes["href"]?.Value;
            }
            else
            {
                // Second choice: the first <img> on the page that carries a src attribute.
                var imgSrcNode = root.SelectSingleNode("//img[@src]");
                if (imgSrcNode != null)
                {
                    imageUrl = imgSrcNode.Attributes["src"]?.Value;
                }
            }

            if (!string.IsNullOrWhiteSpace(imageUrl))
            {
                // Only site-relative URLs get the base prefix; absolute ones are kept as-is.
                bool isAbsolute = imageUrl.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                                  imageUrl.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
                apodDoc.image_url = isAbsolute ? imageUrl : BaseUrl + imageUrl;
            }
            else
            {
                Console.WriteLine("Unable to find image for: " + apodDoc.source_url);
            }
        }
        /// <summary>
        /// Parses one HTML file into an <see cref="ApodDoc"/> via
        /// <c>processHtmlToApodDocObj</c>. Any failure is written to the console
        /// (preceded by a blank line) and then rethrown to the caller unchanged.
        /// </summary>
        /// <param name="filename">Path of the HTML file to process.</param>
        /// <returns>The document produced by the parser.</returns>
        private ApodDoc processApodHtmlFile(string filename)
        {
            ApodDoc parsedDoc = new ApodDoc();

            try
            {
                processHtmlToApodDocObj(filename, out parsedDoc);
                return parsedDoc;
            }
            catch (Exception e)
            {
                // Blank line first so the message starts on its own console line.
                Console.WriteLine();
                string pageName = Path.GetFileNameWithoutExtension(filename);
                Console.WriteLine($"Error processing: {pageName} , msg: {e.Message}");
                throw;
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Posts a percolate query for the JSON document named <paramref name="docName"/>
        /// to the "nasa/_search" endpoint and writes the response status and body
        /// to the console.
        /// </summary>
        /// <param name="docName">
        /// Name (without the ".json" extension) of a document file inside <c>jsonDocsPath</c>.
        /// </param>
        /// <remarks>
        /// The call blocks until the request and the body read have completed.
        /// A non-success status code makes <c>EnsureSuccessStatusCode</c> throw.
        /// </remarks>
        public void percolateQuery(string docName)
        {
            UriBuilder uriBuilder = new UriBuilder(EsScheme, EsHost, EsPort);

            // NOTE(review): the index name is hard-coded here while index() uses
            // NasaIndexName - confirm the two refer to the same index.
            uriBuilder.Path  = "nasa/_search";
            uriBuilder.Query = "pretty";
            var uri = uriBuilder.Uri;

            var queryStr = "{ \"query\" : { \"percolate\" : { \"field\" : \"query\", \"document\" : "
                           + ApodDoc.fromJsonFile(Path.Combine(jsonDocsPath, docName + ".json")).ToString()
                           + "} } }";

            // Both the request content and the response own disposable resources,
            // so release them deterministically once the body has been read.
            using (var content = new StringContent(queryStr, Encoding.UTF8, "application/json"))
            {
                var task = client.PostAsync(uri, content);
                task.Wait();

                using (var response = task.Result)
                {
                    response.EnsureSuccessStatusCode();

                    var strTask = response.Content.ReadAsStringAsync();
                    strTask.Wait();

                    Console.WriteLine("Status: " + response.StatusCode + " , content: " + strTask.Result);
                }
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Loads every "*.json" file in <c>jsonDocsPath</c> and hands the documents to
        /// <c>performIndex</c> in batches of <paramref name="batchSize"/>.
        /// </summary>
        /// <param name="batchSize">
        /// Number of documents per batch. Must be greater than zero; otherwise an error
        /// is printed and nothing is touched (the index is not deleted either).
        /// </param>
        /// <param name="recreateIndex">
        /// When true, <c>deleteIndex(NasaIndexName)</c> is called before indexing starts.
        /// </param>
        /// <remarks>
        /// Failures are reported on the console and not rethrown. "Finished indexing."
        /// is only printed when the whole run completed without an exception.
        /// </remarks>
        internal void index(int batchSize, bool recreateIndex = false)
        {
            // Reject a bad batch size up front: zero would otherwise divide by zero
            // only after the index had possibly been deleted already.
            if (batchSize <= 0)
            {
                Console.WriteLine("Error indexing documents: batchSize must be greater than zero (was " + batchSize + ").");
                return;
            }

            bool progressPrinted = false;

            try {
                if (!Directory.Exists(jsonDocsPath))
                {
                    Directory.CreateDirectory(jsonDocsPath);
                }

                if (recreateIndex)
                {
                    deleteIndex(NasaIndexName);
                }

                var filesToIndex = Directory.GetFiles(jsonDocsPath, "*.json");
                Console.WriteLine("Beginning indexing of " + filesToIndex.Length + " documents.  batchSize: " + batchSize);

                var docs = new List <ApodDoc>();
                foreach (var jsonDoc in filesToIndex)
                {
                    docs.Add(ApodDoc.fromJsonFile(jsonDoc));

                    // Flush as soon as a full batch has been collected; one dot per batch.
                    if (docs.Count >= batchSize)
                    {
                        performIndex(docs);
                        docs.Clear();
                        Console.Write(".");
                        progressPrinted = true;
                    }
                }

                // Send the remainder, but never an empty batch.
                if (docs.Count > 0)
                {
                    performIndex(docs);
                }

                if (progressPrinted)
                {
                    // Terminate the line of progress dots.
                    Console.WriteLine();
                }
                Console.WriteLine("Finished indexing.");
            } catch (Exception e) {
                if (progressPrinted)
                {
                    Console.WriteLine();
                }
                Console.WriteLine("Error indexing documents: " + e.Message);
            }
        }
        /// <summary>
        /// Downloads the image referenced by each "*.json" document in
        /// <paramref name="apodJsonDocsPath"/> into <paramref name="apodImagesPath"/>
        /// by running wget once per image. Images whose output file already exists
        /// are skipped.
        /// </summary>
        /// <param name="apodJsonDocsPath">Directory containing the JSON documents.</param>
        /// <param name="apodImagesPath">Directory the images are written to; created if missing.</param>
        /// <param name="delayMillis">Pause after each download, in milliseconds; values &lt;= 0 disable the pause.</param>
        /// <param name="wgetPath">Full path of the wget executable to run.</param>
        /// <remarks>
        /// Any exception aborts the run; it is reported on the console and not rethrown.
        /// </remarks>
        public void getImages(string apodJsonDocsPath, string apodImagesPath, int delayMillis = 1300,
                              string wgetPath = @"C:\Program Files\Git\mingw64\bin\wget.exe")
        {
            Console.WriteLine("Downloading NASA's Astronomy Picture of the Day's to: " + apodImagesPath);
            try {
                int numSkipped = 0;
                if (!Directory.Exists(apodImagesPath))
                {
                    Directory.CreateDirectory(apodImagesPath);
                }

                foreach (string docFilePath in Directory.GetFiles(apodJsonDocsPath, "*.json"))
                {
                    ApodDoc apodObj = ApodDoc.fromJsonFile(docFilePath);
                    if (string.IsNullOrWhiteSpace(apodObj.image_url))
                    {
                        Console.WriteLine("image_url is empty for: " + apodObj.source_url + " , skipping.");
                        continue;
                    }

                    // Output name: <json file name>_<last path segment of the image URL>.
                    var imageFileName = apodObj.image_url.Substring(apodObj.image_url.LastIndexOf('/') + 1);
                    var imgOutputPath = Path.Combine(apodImagesPath, Path.GetFileNameWithoutExtension(docFilePath) + "_" + imageFileName);

                    // Check the file name only; a dot in a directory name must not count.
                    if (!Path.HasExtension(imgOutputPath))
                    {
                        Console.WriteLine("ImgOutputPath: " + imgOutputPath + " , image_url: " + apodObj.image_url);
                        // Debugging aid only: never break when no debugger is attached.
                        if (System.Diagnostics.Debugger.IsAttached)
                        {
                            System.Diagnostics.Debugger.Break();
                        }
                    }

                    if (File.Exists(imgOutputPath))
                    {
                        // Announce the first skip of a run, then one dot every 50 skipped files.
                        if (numSkipped == 0)
                        {
                            Console.WriteLine("Skipping already processed files.");
                        }
                        else if (numSkipped % 50 == 0)
                        {
                            Console.Write(".");
                        }
                        ++numSkipped;
                        continue;
                    }
                    if (numSkipped > 0)
                    {
                        numSkipped = 0;
                        Console.WriteLine();
                        Console.WriteLine("Processing missing pages...");
                    }

                    // The URL comes from scraped documents; a double quote would break out
                    // of the quoted command-line argument below, so refuse such values.
                    if (apodObj.image_url.IndexOf('"') >= 0 || imgOutputPath.IndexOf('"') >= 0)
                    {
                        Console.WriteLine("image_url contains a quote character for: " + apodObj.source_url + " , skipping.");
                        continue;
                    }

                    // Quote both arguments so spaces in the URL or the path stay inside one argument.
                    var processStartInfo = new ProcessStartInfo(wgetPath, "\"" + apodObj.image_url + "\" -O \"" + imgOutputPath + "\"");
                    processStartInfo.CreateNoWindow  = true;
                    processStartInfo.UseShellExecute = false;

                    int exitCode;
                    using (var proc = new Process())
                    {
                        proc.StartInfo = processStartInfo;
                        proc.Start();
                        proc.WaitForExit();
                        exitCode = proc.ExitCode;
                    }

                    if (exitCode != 0)
                    {
                        Console.WriteLine("Download failed (wget exit code " + exitCode + ") for: " + apodObj.image_url);
                        // wget -O creates the target file before the transfer; remove the
                        // leftover so the next run retries instead of skipping it.
                        if (File.Exists(imgOutputPath))
                        {
                            File.Delete(imgOutputPath);
                        }
                    }

                    // Throttle requests; Thread.Sleep rejects values below -1 and -1 waits forever.
                    if (delayMillis > 0)
                    {
                        Thread.Sleep(delayMillis);
                    }
                }
            } catch (Exception e) {
                Console.WriteLine("Error getting images: " + e.Message);
                return;
            }
        }