/// <summary>
/// Looks up a single Pokemon by name and renders it.
/// </summary>
/// <param name="pokemon">Raw search text; it is normalized with <c>TextCleaner.NormalInput</c> before the lookup.</param>
/// <returns>
/// The view for the Pokemon that was found, or a redirect to "Index" (with <c>TempData["error"]</c> set)
/// when the normalized input is null or the lookup throws.
/// </returns>
public IActionResult SearchByName(string pokemon)
{
    string poke = TextCleaner.NormalInput(pokemon);
    if (poke == null)
    {
        TempData["error"] = "Please enter a valid entry";
        return RedirectToAction("Index");
    }

    PokemonRoot p;
    try
    {
        p = pk.GetPokemon(poke);
    }
    catch (Exception)
    {
        // Any lookup failure is reported to the user as an invalid entry.
        TempData["error"] = "Please enter a valid entry";
        return RedirectToAction("Index");
    }

    // A successful search clears errors left over from earlier requests.
    TempData.Remove("moveerror");
    TempData.Remove("error");
    return View(p);
}
/// <summary>
/// Extracts tweets from <paramref name="content"/> with the <c>RegexContent</c> pattern, appends each one to
/// <c>result.Tweets</c> and returns them.
/// </summary>
/// <param name="result">Record whose <c>Tweets</c> collection receives every parsed tweet and whose <c>Url</c> is passed to <c>RegexParser.AbsoluteUrl</c> for the tweet links.</param>
/// <param name="content">Text the pattern is matched against.</param>
/// <returns>The tweets parsed before any failure occurred; an empty array when nothing matched.</returns>
private Tweet[] FillUserTweet(UserTweet result, string content)
{
    var matches = Regex.Matches(content, RegexContent, RegexOptions.Multiline | RegexOptions.IgnoreCase);
    List<Tweet> tweetList = new List<Tweet>();
    try
    {
        foreach (Match match in matches)
        {
            Tweet tweet = new Tweet();

            // TryParse leaves 0 in the counter when the group is missing or not numeric.
            int comment;
            int.TryParse(match.Groups["Reply"].Value, out comment);
            int forward;
            int.TryParse(match.Groups["Forward"].Value, out forward);

            tweet.Comment = comment;
            tweet.Content = TextCleaner.FullClean(match.Groups["Content"].Value);
            tweet.Mid = match.Groups["Mid"].Value;
            tweet.Forward = forward;
            tweet.Source = match.Groups["Source"].Value;
            tweet.PubDate = DateTimeParser.Parser(match.Groups["PubDate"].Value) ?? DateTime.MinValue;
            tweet.Url = RegexParser.AbsoluteUrl(match.Groups["Url"].Value, result.Url, true);

            result.Tweets.Add(tweet);
            tweetList.Add(tweet);
        }
    }
    catch (Exception ex)
    {
        // Parsing stays best-effort (tweets collected so far are still returned),
        // but the failure is traced instead of disappearing silently.
        System.Diagnostics.Trace.TraceWarning("FillUserTweet stopped after {0} tweet(s): {1}", tweetList.Count, ex);
    }

    return tweetList.ToArray();
}
/// <summary>
/// Converts the current source HTML to plain text, copies that text to the clipboard,
/// stores it as the new source value and reports success in the status area.
/// </summary>
/// <returns>
/// A task that completes after the status has been set. When the conversion or the clipboard
/// write throws, the error is logged, a message box is shown and the method returns early.
/// </returns>
private async Task PlainTextAndCopy()
{
    string sourceHtml = this.SourceValue;
    string plainText;

#pragma warning disable CA1031 // Do not catch general exception types
    try
    {
        plainText = TextCleaner.HtmlToPlainText(sourceHtml);
    }
    catch (Exception conversionError)
    {
        Logger.Log(
            LogLevel.Error,
            nameof(PlainTextAndCopy),
            "Error cleaning HTML to text:" + Environment.NewLine + sourceHtml,
            conversionError);
        MessageBox.Show(
            "There was an error getting text from the HTML",
            "error",
            MessageBoxButton.OK,
            MessageBoxImage.Error);
        return;
    }

    try
    {
        ClipboardHelper.CopyPlainTextToClipboard(plainText);
    }
    catch (Exception clipboardError)
    {
        Logger.Log(
            LogLevel.Error,
            nameof(PlainTextAndCopy),
            "Error writing TEXT to clipboard",
            clipboardError);
        MessageBox.Show(
            "There was an error writing the TEXT to the clipboard",
            "error",
            MessageBoxButton.OK,
            MessageBoxImage.Error);
        return;
    }
#pragma warning restore CA1031 // Do not catch general exception types

    // The source value is only replaced once the clipboard write has succeeded.
    this.SourceValue = plainText;
    await this.SetStatus("The plain TEXT is on the clipboard, use Ctrl-V to paste.").ConfigureAwait(false);
}
/// <summary>
/// Removes styling from the current source HTML (using <c>CleanerSettings.Instance</c>),
/// copies the cleaned HTML to the clipboard and stores it as the new source value.
/// </summary>
/// <returns>
/// A task that completes after the status has been set. When cleaning or the clipboard write
/// throws, the error is logged, a message box is shown and the method returns early.
/// </returns>
private async Task ClearStylingAndCopy()
{
    string html = this.SourceValue;

#pragma warning disable CA1031 // Do not catch general exception types
    try
    {
        html = TextCleaner.ClearStylingFromHtml(
            html,
            CleanerSettings.Instance);
    }
    catch (Exception ex)
    {
        // The assignment above did not happen, so 'html' still holds the unprocessed source.
        Logger.Log(LogLevel.Error, nameof(ClearStylingAndCopy), "Error cleaning HTML:" + Environment.NewLine + html, ex);
        MessageBox.Show("There was an error cleaning the HTML", "error", MessageBoxButton.OK, MessageBoxImage.Error);
        return;
    }

    try
    {
        ClipboardHelper.CopyToClipboard(html, html);
        this.SourceValue = html;
        Logger.Log(LogLevel.Debug, nameof(ClearStylingAndCopy), "Cleaned HTML and copied to clipboard");
    }
    catch (Exception ex)
    {
        Logger.Log(LogLevel.Error, nameof(ClearStylingAndCopy), "Error writing HTML to clipboard", ex);
        // Spelling of the user-facing message corrected ("cleand" -> "cleaned").
        MessageBox.Show("There was an error writing the cleaned HTML to the clipboard", "error", MessageBoxButton.OK, MessageBoxImage.Error);
        return;
    }
#pragma warning restore CA1031 // Do not catch general exception types

    await this.SetStatus("The cleaned HTML is on the clipboard, use Ctrl-V to paste.").ConfigureAwait(false);
}
/// <summary>
/// Derives a file name from the last '/'-separated segment of the text's link.
/// </summary>
/// <param name="authorText">Item whose <c>Link</c> supplies the segment.</param>
/// <returns>The segment after the final '/', passed through <c>TextCleaner.MakeFileAcceptableName</c>.</returns>
public string GetFileName(AuthorText authorText)
{
    string link = authorText.Link;

    // Everything after the final slash; the whole link when it contains no slash.
    string lastSegment = link.Substring(link.LastIndexOf('/') + 1);

    return TextCleaner.MakeFileAcceptableName(lastSegment);
}
/// <summary>
/// Straight single quotes that sit directly inside parentheses are expected to become
/// smart quotes when <c>QuoteProcessing.ChangeToSmartQuotes</c> is applied.
/// </summary>
public void QuotesInBrackets_ShouldBeConverted()
{
    const string input = "<p>Something ('me') something</p>";
    const string expected = "<p>Something (‘me’) something</p>";

    string actual = DocTester.ProcessSource(
        input,
        document => TextCleaner.UpdateQuotes(document, QuoteProcessing.ChangeToSmartQuotes));

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Determines the sentiment of a text from the counts of its n-grams.
/// </summary>
/// <typeparam name="TEntity">N-gram entity type stored in <paramref name="ngramDbSet"/>.</typeparam>
/// <param name="text">Text to classify; punctuation is stripped, excess spaces removed and the text lower-cased first.</param>
/// <param name="isRetweet">Forwarded to the count and vocabulary lookups.</param>
/// <param name="ngramCardinality">Size of the n-grams that are generated and looked up.</param>
/// <param name="smoothingFactor">Forwarded to the class-count and n-gram-count lookups.</param>
/// <param name="isStemmed">When true, the generated n-grams are passed through <c>Processor.StemNgram</c>.</param>
/// <param name="dictionary">Dictionary handed to <c>Processor.StemNgram</c>.</param>
/// <param name="ngramDbSet">Set the counts are read from.</param>
/// <param name="oclumenContext">Context forwarded to the lookups.</param>
/// <param name="ngramDictionary">Optional dictionary forwarded to <c>GetNgramCount</c>.</param>
/// <returns>
/// The key of the last entry returned by <c>GetNgramSentimentProbabilities</c>
/// (presumably the most probable class; confirm that helper's ordering).
/// </returns>
public Sentiment GetTextSentiment<TEntity>(string text, bool isRetweet, int ngramCardinality, decimal smoothingFactor, bool isStemmed, IDictionary<String, String> dictionary, DbSet<TEntity> ngramDbSet, IOclumenContext oclumenContext, Dictionary<string, List<KeyValuePair<Sentiment, decimal>>> ngramDictionary = null) where TEntity : NgramBase
{
    string normalizedText = new TextCleaner(text)
        .StripPunctuation()
        .RemoveExcessSpaces()
        .ToLower()
        .ToString();

    IList<string> terms = NgramGenerator.GenerateNgrams(normalizedText, ngramCardinality);
    if (isStemmed)
    {
        terms = Processor.StemNgram(terms, dictionary);
    }

    var perTermCounts = new List<IList<KeyValuePair<Sentiment, decimal>>>(terms.Count);
    IList<KeyValuePair<Sentiment, decimal>> perClassCounts =
        GetClassCount(isRetweet, ngramCardinality, smoothingFactor, ngramDbSet, oclumenContext);

    // Collect the raw counts for every n-gram.
    foreach (string term in terms)
    {
        var termCounts = GetNgramCount(term, isRetweet, ngramCardinality, smoothingFactor, ngramDbSet, oclumenContext, ngramDictionary);
        perTermCounts.Add(termCounts);
    }

    int vocabularySize = GetVocabularySize(isRetweet, ngramCardinality, ngramDbSet, oclumenContext);

    // Combine the individual n-gram probabilities with the probability of each sentiment class.
    var probabilities = GetNgramSentimentProbabilities(vocabularySize, perTermCounts, perClassCounts);
    return probabilities.Last().Key;
}
/// <summary>
/// Runs <c>TextCleaner.RemoveOfficeMarkup</c> over an HTML fragment that contains
/// <c>&lt;o:p&gt;</c> and <c>&lt;u5:p&gt;</c> elements and asserts that neither opening tag
/// appears in the resulting HTML.
/// </summary>
/// <remarks>
/// The fixture ends with an unclosed <c>&lt;u5:p&gt;</c>, so that case is covered as well.
/// The literal is kept exactly as it stands; do not reflow it.
/// </remarks>
public void RemoveOfficeMarkup() { var source = @" <p class=MsoNormal><b><span lang=EN-US style='mso-ansi-language:EN-US'>HU-HU</span><u5:p></u5:p></b><span lang=EN-US style='mso-ansi-language:EN-US'><o:p></o:p></span></p> <ul style='margin-top:0cm' type=disc> <li class=MsoListParagraph style='margin-left:0cm;mso-list:l0 level1 lfo1'><span lang=EN-US style='mso-fareast-font-family:""Times New Roman"";mso-ansi-language: EN-US'>text in header should be in Hungarian: Olvassa el a Wolters Kluwer legújabb Megfelelőségi szakértői betekintéseit – Cikk, whitepaper, kutatás, esettanulmány és podcast.<o:p></o:p></span><u5:p></u5:p></li> <li class=MsoListParagraph style='margin-left:0cm;mso-list:l0 level1 lfo1'><span lang=EN-US style='mso-fareast-font-family:""Times New Roman"";mso-ansi-language: EN-US'>Read More button – see row 107 for local translation <o:p></o:p></span><u5:p></u5:p></li> <li class=MsoListParagraph style='color:black;margin-left:0cm;mso-list:l0 level1 lfo1'><span lang=EN-US style='mso-fareast-font-family:""Times New Roman"";color:windowtext; mso-ansi-language:EN-US'>add dynanic card for expert insights </span><span lang=EN-US style='mso-fareast-font-family:""Times New Roman"";mso-ansi-language: EN-US'><o:p></o:p></span></li> </ul> <u5:p></u5:p><u5:p> "; var html = DocTester.ProcessSource(source, doc => TextCleaner.RemoveOfficeMarkup(doc)); Assert.IsFalse(html.Contains("<o:p>"), "Office markup should have been removed, like <o:p>"); Assert.IsFalse(html.Contains("<u5:p>"), "Office markup should have been removed, like <u5:p>"); }
/// <summary>
/// With <c>QuoteProcessing.ChangeToSmartQuotes</c>, straight double quotes in the text become
/// smart quotes, while existing smart quotes and the quotes of the <c>target</c> attribute
/// are expected to stay as they are.
/// </summary>
public void ToSmartQuotes_ShouldChangeSimpleQuotes()
{
    const string input = "<a target=\"_blank\">“some remark” said the so-called \"chief.\"</a>";
    const string expected = "<a target=\"_blank\">“some remark” said the so-called “chief.”</a>";

    string actual = DocTester.ProcessSource(
        input,
        document => TextCleaner.UpdateQuotes(document, QuoteProcessing.ChangeToSmartQuotes));

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// A remote link that already carries <c>rel="noreferrer"</c> is expected to get
/// <c>noopener</c> appended to that attribute and <c>target="_blank"</c> added.
/// </summary>
public void LinksToRemoteWithRel2_ShouldGetTargetAndNoOpener()
{
    const string input = "<a href=\"https://www.example.com\" rel=\"noreferrer\">link</a>";
    const string expected = "<a href=\"https://www.example.com\" rel=\"noreferrer noopener\" target=\"_blank\">link</a>";

    string actual = DocTester.ProcessSource(
        input,
        document => TextCleaner.AddBlankLinkTargets(document, true));

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// A link to a local path is expected to pass through <c>AddBlankLinkTargets</c> unchanged.
/// </summary>
public void LinksToLocal_ShouldNotGetTargetOrOpener()
{
    const string input = "<a href=\"/default.html\">link</a>";
    const string expected = "<a href=\"/default.html\">link</a>";

    string actual = DocTester.ProcessSource(
        input,
        document => TextCleaner.AddBlankLinkTargets(document, true));

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// With <c>QuoteProcessing.NoChange</c> the processed HTML is expected to equal the input,
/// for both the smart and the straight quotes it contains.
/// </summary>
public void NoChange_ShouldNotChange()
{
    const string input = "<a target=\"_blank\">“some remark” said the so-called \"chief\"</a>";

    string actual = DocTester.ProcessSource(
        input,
        document => TextCleaner.UpdateQuotes(document, QuoteProcessing.NoChange));

    Assert.AreEqual(input, actual);
}
/// <summary>
/// A remote link that already has a <c>target</c> is expected to keep it and only gain
/// <c>rel="noopener"</c>.
/// </summary>
public void LinksToRemoteWithTarget_ShouldNotChangeTargetButAddOpener()
{
    const string input = "<a href=\"https://www.example.com\" target=\"_self\">link</a>";
    const string expected = "<a href=\"https://www.example.com\" target=\"_self\" rel=\"noopener\">link</a>";

    string actual = DocTester.ProcessSource(
        input,
        document => TextCleaner.AddBlankLinkTargets(document, true));

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// A remote link without a <c>target</c> is expected to get <c>target="_blank"</c> and nothing
/// else when the second argument of <c>AddBlankLinkTargets</c> is false.
/// </summary>
public void LinksToRemote_ShouldGetTarget()
{
    const string input = "<a href=\"https://www.example.com\">link</a>";
    const string expected = "<a href=\"https://www.example.com\" target=\"_blank\">link</a>";

    string actual = DocTester.ProcessSource(
        input,
        document => TextCleaner.AddBlankLinkTargets(document, false));

    Assert.AreEqual(expected, actual);
}
/// <summary>
/// Loads the source into an HTML document, lets the supplied processor work on that document
/// and returns the document's HTML afterwards.
/// </summary>
/// <param name="source">The (html) source to load.</param>
/// <param name="processor">Callback invoked once with the loaded document.</param>
/// <returns>The HTML obtained from the document after the callback has run.</returns>
public static string ProcessSource(string source, Action<HtmlDocument> processor)
{
    var document = TextCleaner.CreateHtmlDocument(source);

    processor(document);

    return TextCleaner.GetHtmlSource(document, false);
}
/// <summary>
/// Cleaning <c>TestData.TextWithOddCharacters</c> must leave no line breaks, none of the listed
/// space sequences, no tabs, and no leading space.
/// </summary>
public void Clean_Text_Fragment()
{
    var cleaned = TextCleaner.Clean(TestData.TextWithOddCharacters);

    // Same three checks as before, evaluated in the same order.
    foreach (string unwanted in new[] { Environment.NewLine, " ", "\t" })
    {
        Assert.DoesNotContain(unwanted, cleaned);
    }

    Assert.False(cleaned.StartsWith(" "));
}
/// <summary>
/// Downloads every URL of one category, extracts the results with <c>baiduRegex</c> and writes one
/// worksheet row per accepted result, then saves the workbook.
/// </summary>
/// <param name="dailyWorksheet">Worksheet that receives the rows.</param>
/// <param name="dailybook">Workbook that is saved once all URLs have been processed.</param>
/// <param name="dailyStartRow">Index of the next row to write; incremented once per written row.</param>
/// <param name="categoryName">Written to column 2 of the first row produced for this category.</param>
/// <param name="categoryUrls">Pages to download and scan.</param>
private void CrawlDailyReport(Worksheet dailyWorksheet, Workbook dailybook, ref int dailyStartRow, string categoryName, string[] categoryUrls)
{
    bool isFirst = true;
    foreach (string url in categoryUrls)
    {
        var dailycontent = WebRequestProcessor.DownloadHTTPString(url);
        // Pause for two seconds after each download before continuing.
        Thread.Sleep(2000);

        var dailyMatches = Regex.Matches(dailycontent, baiduRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);
        foreach (Match dailyMatch in dailyMatches)
        {
            // Only results whose PubDate text contains "前" are kept
            // (presumably relative timestamps such as "... ago"; confirm against the pattern).
            if (!dailyMatch.Groups["PubDate"].Value.Contains("前"))
            {
                continue;
            }

            if (isFirst)
            {
                dailyWorksheet.Cells[dailyStartRow, 2].PutValue(categoryName);
                isFirst = false;
            }

            var resultUrl = dailyMatch.Groups["Url"].Value;
            try
            {
                Uri uri = new Uri(resultUrl);
                var domain = GetUrlDomain(uri.Host);
                // Match the media name.
                dailyWorksheet.Cells[dailyStartRow, 1].PutValue(domain);
            }
            catch (Exception ex)
            {
                // The domain column stays best-effort: the row is still written without it,
                // but the failure is traced instead of being dropped silently.
                System.Diagnostics.Trace.TraceWarning("CrawlDailyReport: no domain written for '{0}': {1}", resultUrl, ex.Message);
            }

            var title = TextCleaner.FullClean(dailyMatch.Groups["Title"].Value) + Environment.NewLine + TextCleaner.FullClean(dailyMatch.Groups["Text"].Value);

            var colorstyle = dailyWorksheet.Cells[dailyStartRow, 6].GetDisplayStyle();
            colorstyle.Font.Color = Color.Blue;

            // The formula refers to the row by dailyStartRow + 1
            // (presumably because formula references are 1-based while cell indexes are 0-based).
            var currentExcelRow = dailyStartRow + 1;

            dailyWorksheet.Cells[dailyStartRow, 0].PutValue(resultUrl);
            dailyWorksheet.Cells[dailyStartRow, 5].Formula = "=VLOOKUP(B" + currentExcelRow + ",Sheet2!A:B,2,FALSE)";
            dailyWorksheet.Cells[dailyStartRow, 6].SetStyle(colorstyle);
            dailyWorksheet.Cells[dailyStartRow, 6].PutValue(title);
            dailyWorksheet.Hyperlinks.Add(dailyStartRow, 6, 1, 1, resultUrl);
            dailyWorksheet.Cells[dailyStartRow, 7].PutValue(DateTime.Now.ToString("yyyy-MM-dd"));
            dailyWorksheet.Cells[dailyStartRow, 8].PutValue("负面舆情");
            dailyStartRow++;
        }
    }

    dailybook.Save(@"D:\dailyreport\日报.xlsx");
}
/// <summary>
/// Extracts the inner text of one field node, located by a relative XPath, from the base node of a single item.
/// </summary>
/// <param name="BaseNode">Root node of the item; null yields null.</param>
/// <param name="RelXPath">Relative XPath of the field; when blank, the text of <paramref name="BaseNode"/> itself is used.</param>
/// <param name="postion">Not read by this method.</param>
/// <param name="CleanConnectionMark">
/// True to clean with <c>TextCleaner.FullClean(text)</c>; false to call the overload with the
/// explicit flags <c>(true, true, true, false, true, false)</c>.
/// </param>
/// <returns>The cleaned text, or null when the base node is null or the path selects no node with text.</returns>
internal static string ExtractInnerTextFromBaseNode(HtmlNode BaseNode, string RelXPath, int postion, bool CleanConnectionMark = true)
{
    if (BaseNode == null)
    {
        return null;
    }

    string innerTextValue;
    if (string.IsNullOrWhiteSpace(RelXPath))
    {
        innerTextValue = XPathUtility.InnerTextNonDescendants(BaseNode);
    }
    else
    {
        innerTextValue = "";
        try
        {
            // First attempt: evaluate the path through the navigator and use the value of the selected node.
            HtmlNodeNavigator navigator = (HtmlNodeNavigator)BaseNode.CreateNavigator();
            var node = navigator.SelectSingleNode(RelXPath);
            if (node != null)
            {
                innerTextValue = node.Value;
            }
        }
        catch (Exception ex)
        {
            // A failing navigator lookup is not fatal, because the node-based lookup below is
            // tried next; the failure is traced instead of being swallowed silently.
            System.Diagnostics.Trace.TraceWarning("ExtractInnerTextFromBaseNode: navigator lookup failed for '{0}': {1}", RelXPath, ex.Message);
        }

        if (string.IsNullOrWhiteSpace(innerTextValue))
        {
            // Fallback: the first selected node that has text of its own.
            IEnumerable<HtmlNode> MatchNodes = BaseNode.SelectNodes(RelXPath);
            HtmlNode firstWithText = MatchNodes == null
                ? null
                : MatchNodes.FirstOrDefault(n => !string.IsNullOrEmpty(XPathUtility.InnerTextNonDescendants(n)));
            if (firstWithText == null)
            {
                return null;
            }

            innerTextValue = XPathUtility.InnerTextNonDescendants(firstWithText);
        }
    }

    return CleanConnectionMark
        ? TextCleaner.FullClean(innerTextValue)
        : TextCleaner.FullClean(innerTextValue, true, true, true, false, true, false);
}
/// <summary>
/// Coroutine that waits five seconds, then keeps polling the board for a question, pausing
/// five seconds after every pass. For a question whose product can be loaded, every intent key
/// contained in the cleaned, lower-cased question text is collected; an answer is created when
/// at least one key matched, otherwise the board is advanced.
/// </summary>
IEnumerator GetQuestion()
{
    WaitForSeconds pollDelay = new WaitForSeconds(5);
    yield return pollDelay;

    Item item = new Item();
    while (true)
    {
        QuestionStructure question = board.GetQuestion();

        // Reset the per-question state before looking at the new question.
        awnsers.Clear();
        tags.Clear();
        item = new Item();

        if (question == null)
        {
            yield return pollDelay;
            continue;
        }

        question.pergunta_texto = question.pergunta_texto.ToLower();
        question.pergunta_texto = TextCleaner.CleanText(question.pergunta_texto);
        Debug.Log("Respondendo: " + question.pergunta_texto);

        if (!item.LoadItem(question.produto_nome))
        {
            Debug.Log("Item nao cadastrado");
            yield return pollDelay;
            continue;
        }

        Debug.Log("Verficando Tags");
        foreach (string intentKey in item.intents.Keys)
        {
            if (!question.pergunta_texto.Contains(intentKey))
            {
                continue;
            }

            tags.Add(intentKey);
            Debug.Log("Tag Encontrada: " + intentKey);
            awnsers.Add(item.intents[intentKey]);
        }

        if (awnsers.Count > 0)
        {
            CreateAwnser();
        }
        else
        {
            board.Next();
        }

        yield return pollDelay;
    }
}
/// <summary>
/// <c>TextCleaner.ToLower()</c> must produce the lower-cased text for mixed-case input and leave
/// empty and already lower-case input unchanged.
/// </summary>
public void TestToLower()
{
    const string firstString = "Hello World";
    var textCleaner = new TextCleaner(firstString);

    // Assert.AreEqual takes (expected, actual); the arguments used to be reversed,
    // which made failure messages report the two values the wrong way round.
    Assert.AreEqual(firstString.ToLower(), textCleaner.ToLower().ToString());
    Assert.AreEqual("", new TextCleaner("").ToLower().ToString());
    Assert.AreEqual("hello", new TextCleaner("hello").ToLower().ToString());
}
/// <summary>
/// <c>TextCleaner.Clean</c> must return null for null, leave a single space untouched, and
/// replace the curly-quote, ellipsis, curly-apostrophe and en-dash code points used below with
/// straight quotes, three dots, apostrophes and a hyphen.
/// </summary>
public void TextCleanerTests_IndexKey_Indexes()
{
    Assert.Null(TextCleaner.Clean(null));

    // Assert.Equal takes (expected, actual); the arguments used to be reversed.
    Assert.Equal(" ", TextCleaner.Clean(" "));

    // Four double-quote variants, two ellipsis variants, four apostrophe variants and an en dash,
    // in the same order as before.
    var test = new string(new[]
    {
        (char)147, (char)148, (char)8220, (char)8221,
        ' ',
        (char)133, (char)8230,
        ' ',
        (char)146, (char)8217, (char)145, (char)8216,
        ' ',
        (char)8211,
    });

    var result = TextCleaner.Clean(test);

    Assert.Equal("\"\"\"\" ...... '''' -", result);
}
/// <summary>
/// Lists the Pokemon of a given type, one page at a time.
/// </summary>
/// <param name="type">Raw type name; it is normalized with <c>TextCleaner.NormalInput</c> before the lookup.</param>
/// <param name="pageNumber">1-based page to show.</param>
/// <param name="pageSize">Number of entries per page.</param>
/// <returns>
/// The view for the requested page, or a redirect to "Index" (with <c>TempData["error"]</c> set)
/// when the normalized input is null or the lookup throws — the same handling the other search
/// actions apply.
/// </returns>
public IActionResult SearchByType(string type, [FromQuery] int pageNumber = 1, [FromQuery] int pageSize = 10)
{
    string t = TextCleaner.NormalInput(type);
    TempData["typeName"] = t;
    if (t == null)
    {
        TempData["error"] = "Please enter a valid entry";
        return RedirectToAction("Index");
    }

    List<Pokemon> pokemon;
    try
    {
        pokemon = pk.GetType(t);
    }
    catch (Exception)
    {
        // Any lookup failure is reported to the user as an invalid entry.
        TempData["error"] = "Please enter a valid entry";
        return RedirectToAction("Index");
    }

    List<Pokemon> pagedPokemon = pokemon.Skip((pageNumber - 1) * pageSize).Take(pageSize).ToList();
    TempData["pageNumber"] = pageNumber;
    TempData["pageSize"] = pageSize;

    // A successful search clears errors left over from earlier requests.
    TempData.Remove("error");
    TempData.Remove("moveerror");
    return View(pagedPokemon);
}
/// <summary>
/// Of five input lines (padded text, empty, comment-only, text with a trailing comment, empty)
/// only the two text entries are expected to remain, trimmed and without the comment.
/// </summary>
public void RemoveCommentsAndWhitespace_WorksCorrectly()
{
    string[] input =
    {
        " foo ",
        "",
        "// some comment",
        "bar // another comment",
        "",
    };

    var cleaned = new TextCleaner().RemoveCommentsAndWhitespace(input);

    Assert.Equal(2, cleaned.Length);
    Assert.Equal("foo", cleaned[0]);
    Assert.Equal("bar", cleaned[1]);
}
/// <summary>
/// Removes styling from the current source HTML (using <c>CleanerSettings.Instance</c>),
/// copies the cleaned HTML to the clipboard and stores it as the new source value.
/// Progress and HTML sizes are logged at debug level.
/// </summary>
/// <returns>
/// A task that completes after the final status has been set. When cleaning or the clipboard
/// write throws, the error is logged, an error status is set and the method returns early.
/// </returns>
private async Task ClearStylingAndCopyAsync()
{
    Logger.Log(LogLevel.Debug, nameof(MainPage), "Start clearing styling");
    string html = this.SourceValue;
    Logger.Log(LogLevel.Debug, nameof(MainPage), $"HTML size before processing: {html?.Length ?? 0}");

#pragma warning disable CA1031 // Do not catch general exception types
    try
    {
        html = TextCleaner.ClearStylingFromHtml(
            html,
            CleanerSettings.Instance);
    }
    catch (Exception ex)
    {
        // The assignment above did not happen, so 'html' still holds the unprocessed source.
        Logger.Log(LogLevel.Error, nameof(ClearStylingAndCopyAsync), "Error cleaning HTML:" + Environment.NewLine + html, ex);
        // Nothing after this await needs the captured context, same as the success path below.
        await this.SetStatusAsync("There was an error cleaning the HTML").ConfigureAwait(false);
        return;
    }

    Logger.Log(LogLevel.Debug, nameof(MainPage), $"HTML size after processing: {html?.Length ?? 0}");

    try
    {
        ClipboardHelper.CopyToClipboard(html, html);
        this.SourceValue = html;
        Logger.Log(LogLevel.Debug, nameof(ClearStylingAndCopyAsync), "Cleaned HTML and copied to clipboard");
    }
    catch (Exception ex)
    {
        Logger.Log(LogLevel.Error, nameof(ClearStylingAndCopyAsync), "Error writing HTML to clipboard", ex);
        // Spelling of the user-facing message corrected ("cleand" -> "cleaned").
        await this.SetStatusAsync("There was an error writing the cleaned HTML to the clipboard").ConfigureAwait(false);
        return;
    }
#pragma warning restore CA1031 // Do not catch general exception types

    await this.SetStatusAsync("The cleaned HTML is on the clipboard, use Ctrl-V to paste.").ConfigureAwait(false);
    Logger.Log(LogLevel.Debug, nameof(MainPage), "Done clearing styling");
}
/// <summary>
/// Copies the named groups of a regex match ("Url", "Title", "Text", "AuthorName", "Source",
/// "MediaName", "PubDate") into the corresponding fields of an article.
/// </summary>
/// <param name="m">Match whose named groups supply the values.</param>
/// <param name="Item">Article to fill; each current value is passed to <c>RegexUtility.TryGetString</c> as its third argument.</param>
/// <param name="BaseUrl">Base URL the matched URL is resolved against.</param>
/// <param name="ItemUrlCaseSensitive">Not read by this method.</param>
public static void Match2Item(Match m, ref Article Item, string BaseUrl, bool ItemUrlCaseSensitive = false)
{
    // Url: resolved against the base URL so relative links become absolute.
    Item.Url = new Uri(new Uri(BaseUrl), RegexUtility.TryGetString(m, "Url", Item.Url, false)).AbsoluteUri;

    // Title
    Item.Title = RegexUtility.TryGetString(m, "Title", Item.Title);

    // Lower the clean level: when the title came back empty, fall back to the raw "Title" group
    // cleaned with HTMLCleaner only. The previous code re-cleaned the already empty title,
    // which could never produce a value.
    // NOTE(review): assumes TryGetString applies a stronger clean by default — confirm.
    if (string.IsNullOrEmpty(Item.Title) && m.Groups["Title"].Success)
    {
        Item.Title = HTMLCleaner.CleanHTML(m.Groups["Title"].Value, true);
    }

    // Text
    Item.HtmlContent = RegexUtility.TryGetString(m, "Text", Item.HtmlContent, false);

    // Author info
    Item.Author = RegexUtility.TryGetString(m, "AuthorName", Item.Author);
    Item.Source = RegexUtility.TryGetString(m, "Source", Item.Source);
    if (!String.IsNullOrWhiteSpace(Item.Source))
    {
        Item.Source = TextCleaner.FullClean(Item.Source);
    }

    // Media info
    Item.MediaName = RegexUtility.TryGetString(m, "MediaName", Item.MediaName);

    // Time: parsed from the cleaned "PubDate" group; the current time is used when the
    // resulting date is not later than DateTime.MinValue.
    if (m.Groups["PubDate"].Success)
    {
        Item.PubDate = DateTimeParser.Parser(HTMLCleaner.CleanHTML(m.Groups["PubDate"].Value, true));
    }

    if (Item.PubDate <= DateTime.MinValue)
    {
        Item.PubDate = DateTime.Now;
    }

    Match2ItemCount(m, Item.ViewDataList);
}
/// <summary>
/// Registers an item whose intents are given as a colon-separated list of alternating keys
/// and answers ("key:answer:key:answer...").
/// </summary>
/// <param name="id">Identifier stored on the item.</param>
/// <param name="description">Intent list; " : " is treated the same as ":".</param>
public void Register(string id, string description)
{
    string[] tokens = description.Replace(" : ", ":").Split(':');

    Item item = new Item();
    item.ID = id;

    // Walk the tokens in (key, answer) pairs; a trailing key without an answer is ignored.
    for (int keyIndex = 0; keyIndex + 1 < tokens.Length; keyIndex += 2)
    {
        string intentKey = TextCleaner.CleanText(tokens[keyIndex]);
        intentKey = TextCleaner.RemovePonctuation(intentKey);
        item.AddIntent(intentKey, tokens[keyIndex + 1]);
    }

    item.SaveItem();
}
/// <summary>
/// Lists the Pokemon that can learn a given move, one page at a time.
/// </summary>
/// <param name="move">Raw move name; it is normalized with <c>TextCleaner.NormalInput</c> before the lookup.</param>
/// <param name="pageNumber">1-based page to show.</param>
/// <param name="pageSize">Number of entries per page.</param>
/// <returns>
/// The view for the requested page, or a redirect to "Index" with <c>TempData["error"]</c> set
/// (null input) or <c>TempData["moveerror"]</c> set (lookup threw).
/// </returns>
public IActionResult SearchByMove(string move, [FromQuery] int pageNumber = 1, [FromQuery] int pageSize = 10)
{
    // Normalizes search string
    string search = TextCleaner.NormalInput(move);
    TempData["moveName"] = search;
    if (search == null)
    {
        TempData["error"] = "Please enter a valid entry";
        return RedirectToAction("Index");
    }

    // Deserializes move object
    MoveRoot m;
    try
    {
        m = pk.GetMove(search);
    }
    catch (Exception)
    {
        // Any lookup failure is reported to the user as an invalid move.
        TempData["moveerror"] = "Please enter a valid entry";
        return RedirectToAction("Index");
    }

    List<Learned_By_Pokemon> pokemonByUrl = m.learned_by_pokemon
        .Skip((pageNumber - 1) * pageSize)
        .Take(pageSize)
        .ToList();

    TempData["pageNumber"] = pageNumber;
    TempData["pageSize"] = pageSize;

    // A successful search clears errors left over from earlier requests.
    TempData.Remove("error");
    TempData.Remove("moveerror");

    // Passing the list into the view
    return View(pokemonByUrl);
}
/// <summary>
/// Extracts the inner text of one field node, located by a relative XPath and a position among
/// the matches, from the base node of a single item.
/// </summary>
/// <param name="BaseNode">Root node of the item; null yields null.</param>
/// <param name="RelXPath">Relative XPath of the field; when blank, only the base node itself (position 0) can be addressed.</param>
/// <param name="postion">Zero-based index into the selected nodes that have text of their own.</param>
/// <param name="CleanConnectionMark">
/// True to clean with <c>TextCleaner.FullClean(text)</c>; false to call the overload with the
/// explicit flags <c>(true, true, true, false, true, false)</c>.
/// </param>
/// <returns>
/// The cleaned text, or null when the base node is null, the position is negative, or fewer
/// than <paramref name="postion"/> + 1 nodes with text are selected.
/// </returns>
internal static string ExtractInnerTextFromBaseNode(HtmlNode BaseNode, string RelXPath, int postion, bool CleanConnectionMark = true)
{
    if (BaseNode == null)
    {
        return null;
    }

    string rawText;
    if (string.IsNullOrWhiteSpace(RelXPath))
    {
        // Without a path there is nothing to select from, so only position 0 (the base node) is
        // valid. Other positions used to reach SelectNodes/ElementAt and throw.
        if (postion != 0)
        {
            return null;
        }

        rawText = XPathUtility.InnerTextNonDescendants(BaseNode);
    }
    else
    {
        // A negative position used to throw from ElementAt; treat it as "no such node".
        if (postion < 0)
        {
            return null;
        }

        IEnumerable<HtmlNode> MatchNodes = BaseNode.SelectNodes(RelXPath);
        if (MatchNodes == null)
        {
            return null;
        }

        // Single pass over the nodes that have text of their own, picking the requested one.
        HtmlNode target = MatchNodes
            .Where(n => !string.IsNullOrEmpty(XPathUtility.InnerTextNonDescendants(n)))
            .Skip(postion)
            .FirstOrDefault();
        if (target == null)
        {
            return null;
        }

        rawText = XPathUtility.InnerTextNonDescendants(target);
    }

    return CleanConnectionMark
        ? TextCleaner.FullClean(rawText)
        : TextCleaner.FullClean(rawText, true, true, true, false, true, false);
}
/// <summary>
/// Checks whether a title is acceptable for the configured language.
/// </summary>
/// <param name="Title">Title to validate; it is cleaned with <c>TextCleaner.FullClean</c> before the checks.</param>
/// <returns>
/// False for a null or blank title. Otherwise true when the cleaned title satisfies the size
/// rule of the language and the digit-ratio rule.
/// </returns>
public bool ValidateTitle(string Title)
{
    if (string.IsNullOrWhiteSpace(Title))
    {
        return false;
    }

    string CleanTitle = TextCleaner.FullClean(Title);

    // Digit rule shared by all languages: disabled when MaxRateTitleDigits >= 1, otherwise the
    // number of digit characters must stay below Length * MaxRateTitleDigits.
    bool digitRateOk = MaxRateTitleDigits >= 1
        || CleanTitle.Length * MaxRateTitleDigits > TextCleaner.CountDigitChars(CleanTitle);

    switch (Language)
    {
        case Enums.Language.ENGLISH:
            // English: enough words AND the digit ratio within bounds. The size rule is
            // parenthesized so the digit rule also applies when MinWordCountTitle <= 0;
            // before, '&&' bound tighter than '||' and the digit rule was skipped in that case.
            // NOTE(review): the word count must be strictly greater than MinWordCountTitle,
            // unlike the '>=' used for MinLenTitle — confirm this is intended.
            int wordCount = CleanTitle.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).Length;
            return (MinWordCountTitle <= 0 || wordCount > MinWordCountTitle) && digitRateOk;

        case Enums.Language.CHINESE:
        default:
            // Chinese (and any other language): long enough AND the digit ratio within bounds.
            return (MinLenTitle <= 0 || CleanTitle.Length >= MinLenTitle) && digitRateOk;
    }
}
/// <summary>
/// Parses one line of the form "frequency word word ..." into an n-gram item.
/// </summary>
/// <param name="line">Line to parse; it is split on spaces with <c>TextCleaner.CleanSplit</c>.</param>
/// <returns>
/// The parsed item, or null when the line has no leading integer frequency, contains one of
/// <c>English.IllegalTokens</c>, or has a frequency below <c>MinimumFrequency</c>.
/// </returns>
private NGramItem GetNGrams(string line)
{
    var segments = TextCleaner.CleanSplit(" ", line);

    // Empty or malformed lines are skipped like other rejected lines instead of throwing.
    // The invariant culture keeps parsing independent of the machine's regional settings.
    long frequency;
    if (segments == null
        || !long.TryParse(
            segments.FirstOrDefault(),
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out frequency))
    {
        return null;
    }

    if (English.IllegalTokens.Any(line.Contains))
    {
        return null;
    }

    if (frequency < MinimumFrequency)
    {
        return null;
    }

    var words = segments.Skip(1).ToArray();
    return new NGramItem
    {
        Text = string.Join(" ", words),
        Frequency = frequency,
        Words = words
    };
}