/// <summary>
/// Runs the text through every registered normalizer, in ascending <c>Order</c>.
/// </summary>
/// <param name="text">The text to normalize.</param>
/// <returns>The normalized text.</returns>
public string Normalize(string text)
{
    // Fix: the local was named "TextNormalizer" (PascalCase), shadowing the type
    // name of the same spelling used elsewhere; renamed to idiomatic camelCase.
    // Order defines the pipeline sequence, so normalizers run deterministically.
    foreach (var normalizer in TextNormalizers.OrderBy(x => x.Order))
    {
        text = normalizer.Normalize(text);
    }

    return text;
}
/// <summary>
/// Normalizes the keywords of every transaction in the import context, in place.
/// </summary>
/// <param name="context">The import context whose transactions are updated.</param>
public void Execute(TransactionImportContext context)
{
    // One normalizer instance is enough for the whole batch.
    var keywordNormalizer = new TextNormalizer();

    foreach (var transaction in context.Transactions)
    {
        transaction.Keywords = keywordNormalizer.Normalize(transaction.Keywords);
    }
}
/// <summary>
/// Cleans a raw name: lower-cases it, strips the filler phrases
/// " en adelante " and " entre ", truncates at " en santiago ",
/// then trims and upper-cases the first letter.
/// </summary>
/// <param name="name">The raw name.</param>
/// <returns>The cleaned name.</returns>
private string CleanName(string name)
{
    // NOTE(review): ToLower() is culture-sensitive; confirm invariant casing
    // is not required for these Spanish-language markers.
    name = name.ToLower();
    name = name.Replace(" en adelante ", "");
    name = name.Replace(" entre ", "");

    // Fix: single IndexOf instead of Contains + IndexOf (the original scanned twice).
    int cutAt = name.IndexOf(" en santiago ");
    if (cutAt >= 0)
    {
        name = name.Substring(0, cutAt);
    }

    return TextNormalizer.GetInstance().FirstLetterUpper(name.Trim());
}
/// <summary>
/// Computes the CDSSM embedding for a document, normalizing and
/// stop-word-filtering the text first.
/// </summary>
/// <param name="doc">The raw document text.</param>
/// <param name="docModelFile">Path to the document-side CDSSM model.</param>
/// <param name="stopWordsFile">Path to the stop-words list.</param>
/// <returns>The embedding vector, or null when loading/embedding fails.</returns>
public static List<float> GetCDSSM(string doc, string docModelFile, string stopWordsFile)
{
    List<float> CDSSMFeature = new List<float>();

    // Fix: the original fetched the model into 'dm' but then called the static
    // 'docModel' field and never null-checked it, so a failed model load caused
    // a NullReferenceException here. Use the returned model and guard it.
    DeepSemanticModel dm = GetDocModel(docModelFile, stopWordsFile);
    if (dm == null)
    {
        Console.WriteLine("Error: CDSSM document-side model load failed ...");
        return null;
    }

    string normalizedDoc = TextNormalizer.Normalizer(
        TextNormalizer.StopWordsFilter(
            doc.Replace('-', ' ').Replace('\'', ' ').ToLowerInvariant().Trim()));

    if (!dm.SemanticFeatureEmbedding(normalizedDoc, out CDSSMFeature))
    {
        Console.WriteLine("Error: document {0} embedding failed ...", doc);
        return null;
    }

    return CDSSMFeature;
}
/// <summary>
/// Filters a list of notary names down to the valid ones, returning each one
/// cleaned and with its first letter upper-cased, in original order.
/// </summary>
/// <param name="names">The raw notary names.</param>
/// <returns>The cleaned, valid names.</returns>
public List<string> CleanNotaryNames(List<string> names)
{
    var cleaned = new List<string>();

    foreach (string rawName in names)
    {
        // Invalid entries are silently dropped.
        if (!IsValidNotaryName(rawName))
        {
            continue;
        }

        cleaned.Add(TextNormalizer.GetInstance().FirstLetterUpper(CleanNotaryName(rawName)));
    }

    return cleaned;
}
/// <summary>
/// Converts an arbitrary value to normalized, trimmed text. Null values and
/// values whose string form is empty or whitespace yield the empty string.
/// </summary>
/// <param name="value">The value to convert; may be null.</param>
/// <returns>The normalized text, or <see cref="string.Empty"/>.</returns>
private static string ToSafeText(object value)
{
    // Null-conditional folds the original's two guard clauses into one:
    // a null value produces null text, caught by IsNullOrWhiteSpace below.
    var text = value?.ToString();

    if (string.IsNullOrWhiteSpace(text))
    {
        return string.Empty;
    }

    return TextNormalizer.Normalize(text.Trim());
}
// Console demo: generates n-grams from a résumé-style text blob twice —
// once from the raw text, once after normalization and stop-word removal —
// printing both result sets for visual comparison.
static void Main(string[] args)
{
    // Sample input: a multi-bullet résumé excerpt, concatenated from literals.
    const string kText = "• Implement brand-new programs for Grants and Scholarships by working with Product Owner, BAs, and QAs teams"+
                         "• Rewrite the legacy Grants and Scholarships systems using new architecture with.Net Core, MVC, &Web API Core."+
                         " Follow Agile and Scrum Methodology with two - week sprint, grooming, tasks planning, and so on."+
                         "• Presentations / Knowledge Shares with Development Team for any new approaches and technologies."+
                         " Financial Institutions Department" +
                         "• Enhance Request / Ticket Systems and Imaging Systems by adding new features and customizations."+
                         " • Refactor the entire of code based using Repository, Domain, Services, and Dependency Injections." +
                         "• Fix and improve UI by using JavaScript, jQuery, and MVC View Razors."+
                         "• Modify and Create SQL Stored Procedures to Support Applications."+
                         "• Enhance Imaging Systems by adding Auto Email feature and customizing UI."+
                         "• Both Request and Imaging Systems’ new features were deployed to PROD server."+
                         "• Tools: Visual Studio 2019 / 2017, TFS 2013, and SQL Management Studio 2014.";

    Text.NGrams ngs = new Text.NGrams();

    // Pass 1: n-grams over the raw, unprocessed text.
    Console.WriteLine("N-Grams from 'raw' text...");
    var ngrams = ngs.GenerateNGrams(kText);
    foreach (var nGram in ngrams)
    {
        Console.WriteLine(nGram.ToString());
    }

    Console.WriteLine("========================================");
    Console.WriteLine("N-Grams from 'normalize and stop words removed' text...");

    // Pass 2: normalize first, then strip stop words, then regenerate n-grams.
    var normalizer = new TextNormalizer();
    var normalizedText = normalizer.NormalizeText(kText);
    var stopWordsRemover = new StopWordsRemover();
    var stopWordsRemoved = stopWordsRemover.RemoveStopWords(normalizedText);

    Console.WriteLine("Normalized and Stop words removed text:");
    Console.WriteLine(stopWordsRemoved);
    Console.WriteLine("\n\n\n");

    var ngrams2 = ngs.GenerateNGrams(stopWordsRemoved);
    foreach (var nGram in ngrams2)
    {
        Console.WriteLine(nGram.ToString());
    }

    PrintEnd();
}
/// <summary>
/// Lazily loads the shared CDSSM document-side model and its stop-word list,
/// caching the model in the static <c>docModel</c> field.
/// </summary>
/// <param name="docModelFile">Path to the document-side model file.</param>
/// <param name="stopWordsFile">Path to the stop-words list.</param>
/// <returns>The cached model, or null when the model load failed.</returns>
private static DeepSemanticModel GetDocModel(string docModelFile, string stopWordsFile)
{
    // Already loaded on a previous call — reuse the cached instance.
    if (docModel != null)
    {
        return docModel;
    }

    docModel = new DeepSemanticModel();
    if (!docModel.ModelLoader(docModelFile))
    {
        Console.WriteLine("Error: CDSSM document-side model load failed ...");
        docModel = null;
        return docModel;
    }

    // A failed stop-word load is logged but deliberately non-fatal.
    if (!TextNormalizer.StopWordLoader(stopWordsFile))
    {
        Console.WriteLine("Error: stop words list load failed ...");
    }

    return docModel;
}
// ReSharper disable once UnusedParameter.Local
// Console demo: compares trigram generation on raw text versus text that has
// been normalized and stripped of stop words, printing each stage's output.
static void Main(string[] args)
{
    const string kText = "this is a test. this is only a test. if this had been an actual program it would not have been so dumb";

    var ngramGenerator = new NGrams(3);

    // Stage 1: n-grams straight from the raw input.
    Console.WriteLine("N-Grams from 'raw' text...");
    foreach (var nGram in ngramGenerator.GenerateNGramsStrings(kText))
    {
        Console.WriteLine(nGram.ToString());
    }

    Console.WriteLine("========================================");
    Console.WriteLine("N-Grams from 'normalize and stop words removed' text...");

    // Stage 2: normalize, then remove stop words.
    var normalizer = new TextNormalizer();
    var normalizedText = normalizer.NormalizeText(kText);
    var stopWordsRemover = new StopWordsRemover();
    var stopWordsRemoved = stopWordsRemover.RemoveStopWords(normalizedText);

    Console.WriteLine("Normalized and Stop words removed text:");
    Console.WriteLine(stopWordsRemoved);
    Console.WriteLine("\n\n\n");

    // Also show the normalized-only form, for contrast.
    var normalizedWithoutStopWordsRemoved = normalizer.NormalizeText(kText);
    Console.WriteLine("Normalized and Stop words NOT removed text:");
    Console.WriteLine(normalizedWithoutStopWordsRemoved);
    Console.WriteLine("\n\n\n");

    var processedNGrams = ngramGenerator.GenerateNGrams(stopWordsRemoved);
    Console.WriteLine("N-Grams from 'normalized' text...");
    foreach (var nGram in processedNGrams)
    {
        Console.WriteLine(nGram.ToString());
    }

    PrintEnd();
}
/// <summary>
/// Converts a database cell value to a nullable int.
/// </summary>
/// <param name="value">The raw cell value; may be <see cref="DBNull"/>.</param>
/// <returns>
/// The integer value; null for DBNull or an unparseable string. Decimals
/// truncate; double/float round via <see cref="Convert.ToInt32(double)"/>.
/// Unrecognized non-null types return 0 (preserved from the original).
/// </returns>
private int? ParseInt(object value)
{
    if (DBNull.Value.Equals(value))
    {
        return null;
    }

    switch (value)
    {
        case decimal decimalValue:
            // Bug fix: the original did "(int)value" on the boxed object, which
            // throws InvalidCastException — a boxed decimal cannot be unboxed
            // directly to int. Unbox as decimal first, then truncate.
            return (int)decimalValue;
        case double doubleValue:
            return Convert.ToInt32(doubleValue);
        case float floatValue:
            return Convert.ToInt32(floatValue);
        case int intValue:
            return intValue;
        case string _:
            // Unparseable strings yield null, as before.
            return int.TryParse(TextNormalizer.ToSafeText(value), out int parsed)
                ? parsed
                : (int?)null;
        default:
            // Any other non-null type fell through to 0 in the original; keep that.
            return 0;
    }
}
// Workout test: builds a text pipeline (normalize -> word-tokenize -> remove
// stop words) over the sentiment sample data, validates the estimator against
// both valid and deliberately-invalid input schemas, then saves the first rows
// and compares them to a golden baseline file.
public void TextNormalizationAndStopwordRemoverWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Valid reader: column 1 loaded as text, as the pipeline expects.
    var data = TextLoader.CreateReader(Env, ctx => (
                                           label: ctx.LoadBool(0),
                                           text: ctx.LoadText(1)), hasHeader: true)
               .Read(new MultiFileSource(sentimentDataPath));

    // Invalid reader: column 1 loaded as float, to verify schema validation rejects it.
    var invalidData = TextLoader.CreateReader(Env, ctx => (
                                                  label: ctx.LoadBool(0),
                                                  text: ctx.LoadFloat(1)), hasHeader: true)
                      .Read(new MultiFileSource(sentimentDataPath));

    // Pipeline under test: normalize "text", tokenize into "words",
    // then strip stop words into "words_without_stopwords".
    var est = new TextNormalizer(Env, "text")
              .Append(new WordTokenizer(Env, "text", "words"))
              .Append(new StopwordRemover(Env, "words", "words_without_stopwords"));

    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });

        // Fit + transform, keep only the first 4 rows for the baseline comparison.
        IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(Env, savedData, "text", "words_without_stopwords");

        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    // Compare the saved output against the checked-in golden file.
    CheckEquality("Text", "words_without_stopwords.tsv");
    Done();
}
/// <summary>
/// Builds a <see cref="PatientInfo"/> from a spreadsheet row, or returns null
/// when the first seven columns are all empty.
/// </summary>
/// <param name="row">The source data row.</param>
/// <param name="rowIndex">Fallback patient number when column 0 is empty.</param>
/// <returns>The parsed patient info, or null for an empty row.</returns>
private PatientInfo ParsePatientInfo(DataRow row, int rowIndex)
{
    // Rows where columns 0..6 are all DBNull are treated as blank and skipped.
    bool anyFieldFilled = false;
    for (int column = 0; column <= 6; column++)
    {
        if (!DBNull.Value.Equals(row[column]))
        {
            anyFieldFilled = true;
            break;
        }
    }

    if (!anyFieldFilled)
    {
        return null;
    }

    // Parse each column in the same order as before.
    var patientNumber = ParseInt(row[0]) ?? rowIndex;
    var gender = GenderParser.Parse(TextNormalizer.ToSafeText(row[1]));
    var age = AgeParser.Parse(TextNormalizer.ToSafeText(row[2]));
    var domicile = TextNormalizer.ToSafeText(row[3]);
    var infectionContact = TextNormalizer.ToSafeText(row[4]);
    var confirmedOn = DateParser.Parse(TextNormalizer.ToSafeText(row[5]));

    return new PatientInfo
    {
        PatientNumber = patientNumber,
        Gender = gender,
        Domicile = domicile.ToUpper(),
        InfectionContact = infectionContact,
        InfectionSourceType = InfectionSourceParser.Parse(infectionContact),
        Age = age,
        ConfirmedOn = confirmedOn
    };
}
/// <summary>
/// Get tokens from a string.
/// </summary>
/// <param name="data">String to tokenize.</param>
/// <param name="options">Text parse options.</param>
/// <returns>List of tokens, ordered by descending occurrence count.</returns>
public static List<Token> GetTokens(string data, TextParseOptions options)
{
    if (options == null)
    {
        throw new ArgumentNullException(nameof(options));
    }

    Dictionary<string, Token> dict = new Dictionary<string, Token>();

    string[] rawTokens = data.Split(options.SplitCharacters, StringSplitOptions.RemoveEmptyEntries);

    for (int i = 0; i < rawTokens.Length; i++)
    {
        string token = rawTokens[i];
        if (String.IsNullOrEmpty(token))
        {
            continue;
        }

        // Apply the configured manipulations in the original order.
        if (options.TokenManipulation.SetLowerCase)
        {
            token = token.ToLower();
        }
        if (options.TokenManipulation.ReduceWhitespace)
        {
            token = TextNormalizer.ReduceWhitespace(token);
        }
        if (options.TokenManipulation.RemovePunctuation)
        {
            token = TextNormalizer.RemovePunctuation(options.PunctuationCharacters, token);
        }
        if (options.TokenManipulation.RemoveNumbers)
        {
            token = TextNormalizer.RemoveNumbers(token);
        }
        if (options.TokenManipulation.RemoveStopWords)
        {
            token = TextNormalizer.RemoveStopWords(options.StopWords, token);
        }

        // Length and stop-word filters run after all manipulations.
        if (token.Length < options.TokenLength.Min)
        {
            continue;
        }
        if (token.Length > options.TokenLength.Max)
        {
            continue;
        }
        if (options.StopWords.Contains(token))
        {
            continue;
        }

        // Fix: the original removed and re-added the dictionary entry and
        // rebuilt the positions list on every duplicate (double lookup plus an
        // O(n) copy per hit). Mutating the existing entry in place produces the
        // same result — positions stay in newest-first order — without the churn.
        if (dict.TryGetValue(token, out Token existing))
        {
            existing.Count++;
            existing.Positions.Insert(0, i);
        }
        else
        {
            Token t = new Token();
            t.Value = token;
            t.Count = 1;
            t.Positions.Add(i);
            dict.Add(token, t);
        }
    }

    // Empty input yields an empty list; the original's dict null checks were
    // always true and have been dropped.
    return dict.Values.OrderByDescending(u => u.Count).ToList();
}
/// <summary>
/// Computes a CDSSM embedding for each tweet in the input TSV (text in
/// column 3, rows with fewer than 8 columns skipped) and writes
/// "input-line TAB comma-joined-vector" rows to the output file.
/// </summary>
/// <param name="documentFile">Input TSV of tweets.</param>
/// <param name="documentVectorFile">Output TSV with appended embeddings.</param>
/// <param name="queryModelFile">Unused here; kept for signature compatibility.</param>
/// <param name="docModelFile">Document-side CDSSM model file.</param>
/// <param name="stopWordsFile">Stop-words list file.</param>
public static void CalculateTweetsCDSSM(string documentFile, string documentVectorFile, string queryModelFile, string docModelFile, string stopWordsFile)
{
    DeepSemanticModel docModel = new DeepSemanticModel();
    if (!docModel.ModelLoader(docModelFile))
    {
        Console.WriteLine("Error: CDSSM document-side model load failed ...");
        return;
    }

    if (!TextNormalizer.StopWordLoader(stopWordsFile))
    {
        Console.WriteLine("Error: stop words list load failed ...");
        return;
    }

    int count = 0, preSize = -1;

    // Fix: both streams are now in using blocks, so the StreamWriter is
    // disposed (and flushed) even if an exception escapes the loop — the
    // original leaked it and could lose buffered output.
    using (StreamWriter sw = new StreamWriter(documentVectorFile))
    using (StreamReader sr = new StreamReader(documentFile))
    {
        string items;
        while (null != (items = sr.ReadLine()))
        {
            string[] cols = items.Split('\t');
            if (cols.Length < 8)
            {
                continue;
            }

            string doc = cols[3];
            List<float> CDSSMFeature = new List<float>();
            string normalizedDoc = TextNormalizer.Normalizer(
                TextNormalizer.StopWordsFilter(
                    doc.Replace('-', ' ').Replace('\'', ' ').ToLowerInvariant().Trim()));

            if (!docModel.SemanticFeatureEmbedding(normalizedDoc, out CDSSMFeature))
            {
                Console.WriteLine("Error: document {0} embedding failed ...", doc);
                continue;
            }

            sw.WriteLine("{0}\t{1}", items, string.Join(",", CDSSMFeature));

            // Progress marker every 100 successful rows.
            if (count % 100 == 0)
            {
                Console.WriteLine("{0}", count);
            }

            // Warn when the embedding dimensionality changes between rows.
            if (preSize != -1 && preSize != CDSSMFeature.Count)
            {
                Console.WriteLine(preSize + " " + CDSSMFeature.Count);
            }

            preSize = CDSSMFeature.Count;
            count++;
        }
    }

    // NOTE(review): this prints preSize (the vector dimension) under a "count"
    // label; `count` was likely intended — confirm before changing the output.
    Console.WriteLine("CDSSM count {0}", preSize);
}