Пример #1
0
        // One client shared by every call: creating an HttpClient per request and never
        // disposing it leaks handlers and sockets when the method is called repeatedly.
        private static readonly HttpClient SharedHttpClient = new HttpClient();

        /// <summary>
        /// Downloads the page at <paramref name="url"/>, takes the inner text of the parsed
        /// HTML document, replaces newlines and tabs with spaces, trims the result, passes it
        /// through <c>StopWordsRemover.RemoveStopwords</c> and finally strips the characters
        /// ':', ';', '.' and '|'.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The cleaned-up text of the page.</returns>
        protected async Task <string> processTxtAsync(string url)
        {
            // Request and read the answer without blocking the calling thread.
            using (var response = await SharedHttpClient.GetAsync(url))
            using (var content = response.Content)
            {
                var html = await content.ReadAsStringAsync();

                var document = new HtmlDocument();
                document.LoadHtml(html);
                var pageText = document.DocumentNode.InnerText;

                // Flatten line breaks and tabs before the stop words are removed.
                var flattened = pageText.Replace("\n", " ").Replace("\t", " ").Trim();
                string clean = StopWordsRemover.RemoveStopwords(flattened);

                // Inside a character class '|' is a literal, not alternation, so this
                // pattern removes ':', ';', '.' and also '|'.
                clean = Regex.Replace(clean, @"[\:|\;|\.]", "");

                return clean;
            }
        }
Пример #2
0
        /// <summary>
        /// Demo entry point: prints a greeting, removes the stop words from a sample
        /// sentence and prints what is left.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Hello World!");

            var remover = new StopWordsRemover();
            var withoutStopWords = remover.RemoveStopWords("the dog ate a bone");

            // Show the outcome; without this the call above has no visible effect.
            Console.WriteLine(withoutStopWords);
        }
Пример #3
0
        /// <summary>
        /// Demo entry point: prints the n-grams generated from a sample text, then
        /// normalizes the text, removes its stop words, prints the processed text and
        /// prints the n-grams generated from it. Ends by calling <c>PrintEnd</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            const string kText =
                "•	Implement brand-new programs for Grants and Scholarships by working with Product Owner, BAs, and QAs teams"+
                "•	Rewrite the legacy Grants and Scholarships systems using new architecture with.Net Core, MVC, &Web API Core."+
                "	Follow Agile and Scrum Methodology with two - week sprint, grooming, tasks planning, and so on."+
                "•	Presentations / Knowledge Shares with Development Team for any new approaches and technologies."+
                " Financial Institutions Department" +
                "•	Enhance Request / Ticket Systems and Imaging Systems by adding new features and customizations."+
                " •	Refactor the entire of code based using Repository, Domain, Services, and Dependency Injections." +
                "•	Fix and improve UI by using JavaScript, jQuery, and MVC View Razors."+
                "•	Modify and Create SQL Stored Procedures to Support Applications."+
                "•	Enhance Imaging Systems by adding Auto Email feature and customizing UI."+
                "•	Both Request and Imaging Systems’ new features were deployed to PROD server."+
                "•	Tools: Visual Studio 2019 / 2017, TFS 2013, and SQL Management Studio 2014.";

            var generator = new Text.NGrams();

            // First pass: n-grams straight from the unprocessed text.
            Console.WriteLine("N-Grams from 'raw' text...");
            var rawNGrams = generator.GenerateNGrams(kText);
            foreach (var entry in rawNGrams)
            {
                Console.WriteLine(entry.ToString());
            }

            Console.WriteLine("========================================");

            // Second pass: normalize, drop the stop words, then generate again.
            Console.WriteLine("N-Grams from 'normalize and stop words removed' text...");
            var textNormalizer = new TextNormalizer();
            var normalized = textNormalizer.NormalizeText(kText);
            var remover = new StopWordsRemover();
            var cleaned = remover.RemoveStopWords(normalized);

            Console.WriteLine("Normalized and Stop words removed text:");
            Console.WriteLine(cleaned);
            Console.WriteLine("\n\n\n");

            var cleanedNGrams = generator.GenerateNGrams(cleaned);
            foreach (var entry in cleanedNGrams)
            {
                Console.WriteLine(entry.ToString());
            }

            PrintEnd();
        }
Пример #4
0
        /// <summary>
        /// Demo entry point: prints the n-gram strings generated from a sample sentence
        /// with an <c>NGrams</c> instance constructed with 3, then prints the normalized
        /// text with and without stop-word removal, and finally prints the n-grams
        /// generated from the stop-words-removed text. Ends by calling <c>PrintEnd</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        // ReSharper disable once UnusedParameter.Local
        static void Main(string[] args)
        {
            const string sampleText =
                "this is a test. this is only a test.  if this had been an actual program it would not have been so dumb";
            var generator = new NGrams(3);

            // First pass: n-gram strings straight from the unprocessed text.
            Console.WriteLine("N-Grams from 'raw' text...");
            var rawNGrams = generator.GenerateNGramsStrings(sampleText);
            foreach (var entry in rawNGrams)
            {
                Console.WriteLine(entry.ToString());
            }

            Console.WriteLine("========================================");

            // Second pass: normalize, then drop the stop words.
            Console.WriteLine("N-Grams from 'normalize and stop words removed' text...");
            var textNormalizer = new TextNormalizer();
            var normalized = textNormalizer.NormalizeText(sampleText);
            var remover = new StopWordsRemover();
            var cleaned = remover.RemoveStopWords(normalized);

            Console.WriteLine("Normalized and Stop words removed text:");
            Console.WriteLine(cleaned);
            Console.WriteLine("\n\n\n");

            // NOTE(review): this repeats NormalizeText on the same input as above;
            // confirm whether the earlier result could simply be reused.
            var normalizedOnly = textNormalizer.NormalizeText(sampleText);

            Console.WriteLine("Normalized and Stop words NOT removed text:");
            Console.WriteLine(normalizedOnly);
            Console.WriteLine("\n\n\n");

            // NOTE(review): these n-grams come from the stop-words-removed text although
            // the heading printed below says 'normalized' - confirm which one is intended.
            var cleanedNGrams = generator.GenerateNGrams(cleaned);

            Console.WriteLine("N-Grams from 'normalized' text...");
            foreach (var entry in cleanedNGrams)
            {
                Console.WriteLine(entry.ToString());
            }

            PrintEnd();
        }
        /// <summary>
        /// Builds the Examine index item for a job. Searchable fields are passed through
        /// <c>StopWordsRemover</c> (when one is configured), while the matching
        /// "...Display" fields keep the original text.
        /// </summary>
        /// <param name="job">The job to index.</param>
        /// <param name="indexType">Value stored as the node definition's type.</param>
        /// <returns>
        /// The populated data set, or <c>null</c> when the job's publish date is later
        /// than the current UTC time.
        /// </returns>
        private SimpleDataSet CreateIndexItemFromJob(Job job, string indexType)
        {
            if (job.DatePublished > DateTime.UtcNow)
            {
                LogHelper.Info <BaseJobsIndexer>($"Ignoring job '{job.Id}' because it's publish date {job.DatePublished.ToIso8601DateTime()} is in the future.");
                return(null);
            }

            LogHelper.Info <BaseJobsIndexer>($"Building Examine index item for job '{job.Id}'");

            var salaryDisplay = job.Salary.SalaryRange;
            var salary = salaryDisplay;

            if (!String.IsNullOrEmpty(salary))
            {
                if (TagSanitiser != null)
                {
                    salary = TagSanitiser.StripTags(salary);
                }
                salary = FilterStopWords(salary);
            }

            // Format with the invariant culture so the decimal separator is always "." and
            // the Replace below really removes it; with a comma-decimal current culture the
            // separator would otherwise survive into the index value.
            const string salaryFormat = "0000000.00";
            var salaryMin = job.Salary.MinimumSalary?.ToString(salaryFormat, CultureInfo.InvariantCulture).Replace(".", string.Empty) ?? String.Empty;
            var salaryMax = job.Salary.MaximumSalary?.ToString(salaryFormat, CultureInfo.InvariantCulture).Replace(".", string.Empty) ?? String.Empty;

            var simpleDataSet = new SimpleDataSet {
                NodeDefinition = new IndexedNode(), RowData = new Dictionary <string, string>()
            };

            simpleDataSet.NodeDefinition.NodeId = job.Id;
            simpleDataSet.NodeDefinition.Type   = indexType;
            simpleDataSet.RowData.Add("id", job.Id.ToString(CultureInfo.InvariantCulture));
            simpleDataSet.RowData.Add("reference", job.Reference);
            simpleDataSet.RowData.Add("numberOfPositions", job.NumberOfPositions?.ToString(CultureInfo.CurrentCulture));
            simpleDataSet.RowData.Add("title", FilterStopWords(job.JobTitle));
            simpleDataSet.RowData.Add("titleDisplay", job.JobTitle);
            simpleDataSet.RowData.Add("logoUrl", job.LogoUrl?.ToString());
            simpleDataSet.RowData.Add("organisation", FilterStopWords(job.Organisation));
            simpleDataSet.RowData.Add("organisationDisplay", job.Organisation);
            simpleDataSet.RowData.Add("salary", salary);
            simpleDataSet.RowData.Add("salaryDisplay", salaryDisplay); // so that it's not displayed with stop words removed
            simpleDataSet.RowData.Add("salaryRange", FilterStopWords(job.Salary.SearchRange));
            simpleDataSet.RowData.Add("salaryMin", salaryMin);
            simpleDataSet.RowData.Add("salaryMax", salaryMax);
            simpleDataSet.RowData.Add("salarySort", salaryMin + " " + salaryMax + " " + FilterStopWords(job.Salary.SalaryRange));
            simpleDataSet.RowData.Add("hourlyRate", job.Salary.MinimumHourlyRate?.ToString(CultureInfo.CurrentCulture));
            simpleDataSet.RowData.Add("hoursPerWeek", job.WorkPattern.HoursPerWeek?.ToString(CultureInfo.CurrentCulture));
            simpleDataSet.RowData.Add("jobType", FilterStopWords(job.JobType));
            simpleDataSet.RowData.Add("jobTypeDisplay", job.JobType);
            simpleDataSet.RowData.Add("contractType", FilterStopWords(job.ContractType));
            simpleDataSet.RowData.Add("department", FilterStopWords(job.Department));
            simpleDataSet.RowData.Add("departmentDisplay", job.Department);
            simpleDataSet.RowData.Add("datePublished", job.DatePublished.ToIso8601DateTime());

            if (job.ClosingDate.HasValue)
            {
                simpleDataSet.RowData.Add("closingDate", job.ClosingDate.Value.ToIso8601DateTime());
                simpleDataSet.RowData.Add("closingDateDisplay", job.ClosingDate.Value.ToIso8601DateTime());
            }
            else
            {
                // Examine queries are simpler if there is always a closing date, so set a far-future closing date to represent never,
                // and don't have a version of the closing date for display.
                simpleDataSet.RowData.Add("closingDate", DateTime.MaxValue.ToIso8601DateTime());
            }

            var workPatternList = string.Join(", ", job.WorkPattern.WorkPatterns.ToArray <string>());

            simpleDataSet.RowData.Add("workPattern", workPatternList);

            var locationsList = string.Join(", ", job.Locations.ToArray <string>());

            simpleDataSet.RowData.Add("location", FilterStopWords(locationsList));
            simpleDataSet.RowData.Add("locationDisplay", locationsList); // because Somewhere-on-Sea needs to lose the "on" for searching but keep it for display

            if (job.AdvertHtml != null)
            {
                var advertHtml = job.AdvertHtml.ToHtmlString();
                var fullText = TagSanitiser != null ? TagSanitiser.StripTags(advertHtml) : advertHtml;

                // Append other fields as keywords, otherwise a search term that's a good match will not be found if it has terms from two fields,
                // eg Job Title (full time)
                const string space = " ";
                fullText = new StringBuilder(fullText)
                           .Append(space).Append(job.Reference)
                           .Append(space).Append(job.JobTitle)
                           .Append(space).Append(job.Organisation)
                           // Append the joined names: appending the collection itself would
                           // add its ToString() text instead of the locations.
                           .Append(space).Append(locationsList)
                           .Append(space).Append(job.JobType)
                           .Append(space).Append(job.ContractType)
                           .Append(space).Append(job.Department)
                           // NOTE(review): relies on WorkPattern.ToString() producing searchable text - confirm.
                           .Append(space).Append(job.WorkPattern.ToString())
                           .ToString();

                simpleDataSet.RowData.Add("fullText", fullText);
                simpleDataSet.RowData.Add("fullHtml", advertHtml);
            }
            if (job.AdditionalInformationHtml != null)
            {
                simpleDataSet.RowData.Add("additionalInfo", job.AdditionalInformationHtml.ToHtmlString());
            }
            if (job.EqualOpportunitiesHtml != null)
            {
                simpleDataSet.RowData.Add("equalOpportunities", job.EqualOpportunitiesHtml.ToHtmlString());
            }
            simpleDataSet.RowData.Add("applyUrl", job.ApplyUrl?.ToString());

            return(simpleDataSet);
        }

        /// <summary>
        /// Passes <paramref name="text"/> through <c>StopWordsRemover.Filter</c> when a
        /// remover is configured; otherwise returns the text unchanged.
        /// </summary>
        private string FilterStopWords(string text)
        {
            return StopWordsRemover != null ? StopWordsRemover.Filter(text) : text;
        }