// Shared client for all downloads. Creating a new HttpClient per call (and never
// disposing it) leaks handlers and can exhaust sockets under load.
private static readonly HttpClient SharedHttpClient = new HttpClient();

/// <summary>
/// Downloads the page at <paramref name="url"/>, takes the inner text of the parsed HTML
/// document, removes stop words and strips a small set of punctuation characters.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>The cleaned-up text of the page.</returns>
protected async Task<string> processTxtAsync(string url)
{
    // Fetch the response without blocking the calling thread.
    using (var response = await SharedHttpClient.GetAsync(url))
    using (var content = response.Content)
    {
        // Read the body without blocking the calling thread.
        var html = await content.ReadAsStringAsync();

        var document = new HtmlDocument();
        document.LoadHtml(html);
        var pageText = document.DocumentNode.InnerText;

        // Turn line breaks and tabs into spaces before the stop words are removed.
        var flattened = pageText.Replace("\n", " ").Replace("\t", " ").Trim();
        string clean = StopWordsRemover.RemoveStopwords(flattened);

        // Strip ':', ';' and '.'. Inside a character class '|' is a literal rather than
        // an alternation, so pipe characters are removed by this pattern as well.
        clean = Regex.Replace(clean, @"[\:|\;|\.]", "");
        return clean;
    }
}
/// <summary>
/// Console entry point: prints a greeting and runs the stop-word remover over a sample sentence.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
static void Main(string[] args)
{
    Console.WriteLine("Hello World!");

    // The previous local stop-word array was never read or handed to the remover,
    // so it has been dropped; the remover is constructed with its defaults as before.
    StopWordsRemover swr = new StopWordsRemover();

    // The result is not printed; it is kept in a local so it can be inspected in a debugger.
    var w = swr.RemoveStopWords("the dog ate a bone");
}
/// <summary>
/// Console demo: prints the n-grams generated from a block of sample text, then prints the
/// normalized, stop-word-free version of that text followed by the n-grams generated from it.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
static void Main(string[] args)
{
    // Sample input. The pieces are concatenated exactly as written, without extra separators.
    const string sampleText =
        "• Implement brand-new programs for Grants and Scholarships by working with Product Owner, BAs, and QAs teams" +
        "• Rewrite the legacy Grants and Scholarships systems using new architecture with.Net Core, MVC, &Web API Core." +
        " Follow Agile and Scrum Methodology with two - week sprint, grooming, tasks planning, and so on." +
        "• Presentations / Knowledge Shares with Development Team for any new approaches and technologies." +
        " Financial Institutions Department" +
        "• Enhance Request / Ticket Systems and Imaging Systems by adding new features and customizations." +
        " • Refactor the entire of code based using Repository, Domain, Services, and Dependency Injections." +
        "• Fix and improve UI by using JavaScript, jQuery, and MVC View Razors." +
        "• Modify and Create SQL Stored Procedures to Support Applications." +
        "• Enhance Imaging Systems by adding Auto Email feature and customizing UI." +
        "• Both Request and Imaging Systems’ new features were deployed to PROD server." +
        "• Tools: Visual Studio 2019 / 2017, TFS 2013, and SQL Management Studio 2014.";

    var generator = new Text.NGrams();

    // Pass 1: n-grams straight from the unprocessed text.
    Console.WriteLine("N-Grams from 'raw' text...");
    var rawNGrams = generator.GenerateNGrams(sampleText);
    foreach (var entry in rawNGrams)
    {
        Console.WriteLine(entry.ToString());
    }

    Console.WriteLine("========================================");

    // Pass 2: normalize, remove stop words, then generate n-grams again.
    Console.WriteLine("N-Grams from 'normalize and stop words removed' text...");
    var textNormalizer = new TextNormalizer();
    var normalized = textNormalizer.NormalizeText(sampleText);

    var remover = new StopWordsRemover();
    var cleaned = remover.RemoveStopWords(normalized);

    Console.WriteLine("Normalized and Stop words removed text:");
    Console.WriteLine(cleaned);
    Console.WriteLine("\n\n\n");

    var cleanedNGrams = generator.GenerateNGrams(cleaned);
    foreach (var entry in cleanedNGrams)
    {
        Console.WriteLine(entry.ToString());
    }

    PrintEnd();
}
// ReSharper disable once UnusedParameter.Local
/// <summary>
/// Console demo: prints the n-gram strings generated from a short sample sentence by an
/// <c>NGrams</c> instance constructed with 3, then prints the normalized text with and without
/// stop words, and finally the n-grams generated from the stop-word-free text.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
static void Main(string[] args)
{
    const string sampleText = "this is a test. this is only a test. if this had been an actual program it would not have been so dumb";

    var generator = new NGrams(3);

    // Pass 1: n-gram strings straight from the unprocessed text.
    Console.WriteLine("N-Grams from 'raw' text...");
    var rawNGrams = generator.GenerateNGramsStrings(sampleText);
    foreach (var entry in rawNGrams)
    {
        Console.WriteLine(entry.ToString());
    }

    Console.WriteLine("========================================");

    // Pass 2: normalize, remove stop words and show the intermediate text.
    Console.WriteLine("N-Grams from 'normalize and stop words removed' text...");
    var textNormalizer = new TextNormalizer();
    var normalized = textNormalizer.NormalizeText(sampleText);

    var remover = new StopWordsRemover();
    var cleaned = remover.RemoveStopWords(normalized);

    Console.WriteLine("Normalized and Stop words removed text:");
    Console.WriteLine(cleaned);
    Console.WriteLine("\n\n\n");

    // The text is normalized a second time here, exactly as before, to show it with stop words kept.
    var normalizedOnly = textNormalizer.NormalizeText(sampleText);
    Console.WriteLine("Normalized and Stop words NOT removed text:");
    Console.WriteLine(normalizedOnly);
    Console.WriteLine("\n\n\n");

    // These n-grams are generated from the stop-word-free text.
    var cleanedNGrams = generator.GenerateNGrams(cleaned);
    Console.WriteLine("N-Grams from 'normalized' text...");
    foreach (var entry in cleanedNGrams)
    {
        Console.WriteLine(entry.ToString());
    }

    PrintEnd();
}
/// <summary>
/// Builds the Examine index item for a single job: a <c>SimpleDataSet</c> whose row data holds
/// searchable versions of the job's fields (stop words filtered where a remover is configured)
/// alongside unfiltered "Display" versions.
/// </summary>
/// <param name="job">The job to index.</param>
/// <param name="indexType">Value stored as the type of the index node.</param>
/// <returns>
/// The populated data set, or <c>null</c> when the job's publish date is later than the current UTC time.
/// </returns>
private SimpleDataSet CreateIndexItemFromJob(Job job, string indexType)
{
    if (job.DatePublished > DateTime.UtcNow)
    {
        LogHelper.Info<BaseJobsIndexer>($"Ignoring job '{job.Id}' because it's publish date {job.DatePublished.ToIso8601DateTime()} is in the future.");
        return null;
    }

    LogHelper.Info<BaseJobsIndexer>($"Building Examine index item for job '{job.Id}'");

    // Applies the stop-word filter when one is configured; otherwise returns the text untouched.
    Func<string, string> withoutStopWords =
        text => StopWordsRemover != null ? StopWordsRemover.Filter(text) : text;

    var salary = job.Salary.SalaryRange;
    var salaryWithStopWords = salary;
    if (!String.IsNullOrEmpty(salary))
    {
        if (TagSanitiser != null)
        {
            salary = TagSanitiser.StripTags(salary);
        }
        salary = withoutStopWords(salary);
    }

    // Fixed-width salary keys with the decimal point removed. Formatting with the invariant
    // culture guarantees the separator really is ".", whatever culture the indexer runs under;
    // with the current culture the Replace below could silently leave a "," in the key.
    var salaryMinKey = job.Salary.MinimumSalary?.ToString("0000000.00", CultureInfo.InvariantCulture).Replace(".", string.Empty) ?? String.Empty;
    var salaryMaxKey = job.Salary.MaximumSalary?.ToString("0000000.00", CultureInfo.InvariantCulture).Replace(".", string.Empty) ?? String.Empty;

    var simpleDataSet = new SimpleDataSet { NodeDefinition = new IndexedNode(), RowData = new Dictionary<string, string>() };

    simpleDataSet.NodeDefinition.NodeId = job.Id;
    simpleDataSet.NodeDefinition.Type = indexType;

    simpleDataSet.RowData.Add("id", job.Id.ToString(CultureInfo.InvariantCulture));
    simpleDataSet.RowData.Add("reference", job.Reference);
    simpleDataSet.RowData.Add("numberOfPositions", job.NumberOfPositions?.ToString(CultureInfo.CurrentCulture));
    simpleDataSet.RowData.Add("title", withoutStopWords(job.JobTitle));
    simpleDataSet.RowData.Add("titleDisplay", job.JobTitle);
    simpleDataSet.RowData.Add("logoUrl", job.LogoUrl?.ToString());
    simpleDataSet.RowData.Add("organisation", withoutStopWords(job.Organisation));
    simpleDataSet.RowData.Add("organisationDisplay", job.Organisation);
    simpleDataSet.RowData.Add("salary", salary);
    simpleDataSet.RowData.Add("salaryDisplay", salaryWithStopWords); // so that it's not displayed with stop words removed
    simpleDataSet.RowData.Add("salaryRange", withoutStopWords(job.Salary.SearchRange));
    simpleDataSet.RowData.Add("salaryMin", salaryMinKey);
    simpleDataSet.RowData.Add("salaryMax", salaryMaxKey);
    simpleDataSet.RowData.Add("salarySort", salaryMinKey + " " + salaryMaxKey + " " + withoutStopWords(job.Salary.SalaryRange));
    simpleDataSet.RowData.Add("hourlyRate", job.Salary.MinimumHourlyRate?.ToString(CultureInfo.CurrentCulture));
    simpleDataSet.RowData.Add("hoursPerWeek", job.WorkPattern.HoursPerWeek?.ToString(CultureInfo.CurrentCulture));
    simpleDataSet.RowData.Add("jobType", withoutStopWords(job.JobType));
    simpleDataSet.RowData.Add("jobTypeDisplay", job.JobType);
    simpleDataSet.RowData.Add("contractType", withoutStopWords(job.ContractType));
    simpleDataSet.RowData.Add("department", withoutStopWords(job.Department));
    simpleDataSet.RowData.Add("departmentDisplay", job.Department);
    simpleDataSet.RowData.Add("datePublished", job.DatePublished.ToIso8601DateTime());

    if (job.ClosingDate.HasValue)
    {
        simpleDataSet.RowData.Add("closingDate", job.ClosingDate.Value.ToIso8601DateTime());
        simpleDataSet.RowData.Add("closingDateDisplay", job.ClosingDate.Value.ToIso8601DateTime());
    }
    else
    {
        // Examine queries are simpler if there is always a closing date, so set a far-future closing date to represent never,
        // and don't have a version of the closing date for display.
        simpleDataSet.RowData.Add("closingDate", DateTime.MaxValue.ToIso8601DateTime());
    }

    var workPatternList = string.Join(", ", job.WorkPattern.WorkPatterns.ToArray<string>());
    simpleDataSet.RowData.Add("workPattern", workPatternList);

    var locationsList = string.Join(", ", job.Locations.ToArray<string>());
    simpleDataSet.RowData.Add("location", withoutStopWords(locationsList));
    simpleDataSet.RowData.Add("locationDisplay", locationsList); // because Somewhere-on-Sea needs to lose the "on" for searching but keep it for display

    if (job.AdvertHtml != null)
    {
        var fullText = job.AdvertHtml.ToHtmlString();
        if (TagSanitiser != null)
        {
            fullText = TagSanitiser.StripTags(fullText);
        }

        // Append other fields as keywords, otherwise a search term that's a good match will not be found if it has terms from two fields,
        // eg Job Title (full time)
        const string space = " ";
        fullText = new StringBuilder(fullText)
            .Append(space).Append(job.Reference)
            .Append(space).Append(job.JobTitle)
            .Append(space).Append(job.Organisation)
            // Append the joined location names. Appending the collection object itself writes
            // its ToString() (for framework collections, a type name) rather than the locations.
            .Append(space).Append(locationsList)
            .Append(space).Append(job.JobType)
            .Append(space).Append(job.ContractType)
            .Append(space).Append(job.Department)
            .Append(space).Append(job.WorkPattern.ToString())
            .ToString();

        simpleDataSet.RowData.Add("fullText", fullText);
        simpleDataSet.RowData.Add("fullHtml", job.AdvertHtml.ToHtmlString());
    }

    if (job.AdditionalInformationHtml != null)
    {
        simpleDataSet.RowData.Add("additionalInfo", job.AdditionalInformationHtml.ToHtmlString());
    }

    if (job.EqualOpportunitiesHtml != null)
    {
        simpleDataSet.RowData.Add("equalOpportunities", job.EqualOpportunitiesHtml.ToHtmlString());
    }

    simpleDataSet.RowData.Add("applyUrl", job.ApplyUrl?.ToString());

    return simpleDataSet;
}