Beispiel #1
0
 /// <summary>
 /// Applies every registered text normalizer to the input, in ascending
 /// <c>Order</c>, feeding each normalizer's output into the next.
 /// </summary>
 /// <param name="text">The text to normalize.</param>
 /// <returns>The fully normalized text.</returns>
 public string Normalize(string text)
 {
     return TextNormalizers
            .OrderBy(x => x.Order)
            .Aggregate(text, (current, normalizer) => normalizer.Normalize(current));
 }
Beispiel #2
0
        /// <summary>
        /// Normalizes the keyword text of every transaction in the import batch.
        /// </summary>
        /// <param name="context">The transaction import context holding the batch.</param>
        public void Execute(TransactionImportContext context)
        {
            var keywordNormalizer = new TextNormalizer();

            foreach (var transaction in context.Transactions)
            {
                transaction.Keywords = keywordNormalizer.Normalize(transaction.Keywords);
            }
        }
Beispiel #3
0
        /// <summary>
        /// Strips known Spanish filler phrases from a name and title-cases the result.
        /// </summary>
        /// <param name="name">The raw name text.</param>
        /// <returns>The cleaned, first-letter-uppercased name.</returns>
        private string CleanName(string name)
        {
            // Lower-case first so the phrase matches below are case-insensitive.
            var cleaned = name.ToLower()
                              .Replace(" en adelante ", "")
                              .Replace(" entre ", "");

            // Truncate at " en santiago " when present.
            var markerIndex = cleaned.IndexOf(" en santiago ");
            if (markerIndex >= 0)
            {
                cleaned = cleaned.Substring(0, markerIndex);
            }

            return TextNormalizer.GetInstance().FirstLetterUpper(cleaned.Trim());
        }
        /// <summary>
        /// Computes the CDSSM document-side embedding for a single document.
        /// </summary>
        /// <param name="doc">Raw document text.</param>
        /// <param name="docModelFile">Path to the document-side CDSSM model file.</param>
        /// <param name="stopWordsFile">Path to the stop-word list file.</param>
        /// <returns>The embedding vector, or null when the model failed to load or the embedding failed.</returns>
        public static List<float> GetCDSSM(string doc, string docModelFile, string stopWordsFile)
        {
            // BUG FIX: the loaded model was assigned to an unused local ("dm") while the
            // static field was dereferenced with no null check; a failed model load then
            // threw NullReferenceException instead of returning null.
            DeepSemanticModel model = GetDocModel(docModelFile, stopWordsFile);
            if (model == null)
            {
                return null;
            }

            // Same normalization pipeline as CalculateTweetsCDSSM: strip hyphens/apostrophes,
            // lower-case, trim, remove stop words, then normalize.
            string normalizedDoc = TextNormalizer.Normalizer(
                TextNormalizer.StopWordsFilter(
                    doc.Replace('-', ' ').Replace('\'', ' ').ToLowerInvariant().Trim()));

            List<float> CDSSMFeature;
            if (!model.SemanticFeatureEmbedding(normalizedDoc, out CDSSMFeature))
            {
                Console.WriteLine("Error: document {0} embedding failed ...", doc);
                return null;
            }
            return CDSSMFeature;
        }
Beispiel #5
0
        /// <summary>
        /// Filters out invalid notary names and returns the remaining names,
        /// cleaned and first-letter-uppercased.
        /// </summary>
        /// <param name="names">The raw notary names.</param>
        /// <returns>The cleaned, valid notary names.</returns>
        public List<string> CleanNotaryNames(List<string> names)
        {
            var cleanedNames = new List<string>();

            foreach (var name in names)
            {
                if (!IsValidNotaryName(name))
                {
                    continue;
                }

                cleanedNames.Add(TextNormalizer.GetInstance().FirstLetterUpper(CleanNotaryName(name)));
            }

            return cleanedNames;
        }
        /// <summary>
        /// Converts an arbitrary value to normalized text, mapping null or
        /// whitespace-only input to the empty string.
        /// </summary>
        /// <param name="value">The value to convert; may be null.</param>
        /// <returns>The normalized text, or <see cref="string.Empty"/>.</returns>
        private static string ToSafeText(object value)
        {
            var text = value?.ToString();

            return string.IsNullOrWhiteSpace(text)
                ? string.Empty
                : TextNormalizer.Normalize(text.Trim());
        }
Beispiel #7
0
        /// <summary>
        /// Demo: prints n-grams for a sample resume text, first raw, then after
        /// normalization and stop-word removal.
        /// </summary>
        static void Main(string[] args)
        {
            const string kText =
                "•	Implement brand-new programs for Grants and Scholarships by working with Product Owner, BAs, and QAs teams"+
                "•	Rewrite the legacy Grants and Scholarships systems using new architecture with.Net Core, MVC, &Web API Core."+
                "	Follow Agile and Scrum Methodology with two - week sprint, grooming, tasks planning, and so on."+
                "•	Presentations / Knowledge Shares with Development Team for any new approaches and technologies."+
                " Financial Institutions Department" +
                "•	Enhance Request / Ticket Systems and Imaging Systems by adding new features and customizations."+
                " •	Refactor the entire of code based using Repository, Domain, Services, and Dependency Injections." +
                "•	Fix and improve UI by using JavaScript, jQuery, and MVC View Razors."+
                "•	Modify and Create SQL Stored Procedures to Support Applications."+
                "•	Enhance Imaging Systems by adding Auto Email feature and customizing UI."+
                "•	Both Request and Imaging Systems’ new features were deployed to PROD server."+
                "•	Tools: Visual Studio 2019 / 2017, TFS 2013, and SQL Management Studio 2014.";

            var nGramGenerator = new Text.NGrams();

            // Pass 1: n-grams over the raw text.
            Console.WriteLine("N-Grams from 'raw' text...");
            foreach (var nGram in nGramGenerator.GenerateNGrams(kText))
            {
                Console.WriteLine(nGram.ToString());
            }

            Console.WriteLine("========================================");

            // Pass 2: n-grams after normalization and stop-word removal.
            Console.WriteLine("N-Grams from 'normalize and stop words removed' text...");
            var normalizedText   = new TextNormalizer().NormalizeText(kText);
            var stopWordsRemoved = new StopWordsRemover().RemoveStopWords(normalizedText);

            Console.WriteLine("Normalized and Stop words removed text:");
            Console.WriteLine(stopWordsRemoved);
            Console.WriteLine("\n\n\n");

            foreach (var nGram in nGramGenerator.GenerateNGrams(stopWordsRemoved))
            {
                Console.WriteLine(nGram.ToString());
            }

            PrintEnd();
        }
 /// <summary>
 /// Lazily loads the document-side CDSSM model into the static cache and
 /// returns it; returns null (and clears the cache) when the model file
 /// fails to load.
 /// </summary>
 /// <param name="docModelFile">Path to the document-side model file.</param>
 /// <param name="stopWordsFile">Path to the stop-word list file.</param>
 /// <returns>The cached model, or null on load failure.</returns>
 private static DeepSemanticModel GetDocModel(string docModelFile, string stopWordsFile)
 {
     // Reuse the already-loaded model on subsequent calls.
     if (docModel != null)
     {
         return docModel;
     }

     docModel = new DeepSemanticModel();
     if (!docModel.ModelLoader(docModelFile))
     {
         Console.WriteLine("Error: CDSSM document-side model load failed ...");
         docModel = null;
         return docModel;
     }

     // A missing stop-word list is logged but treated as non-fatal.
     if (!TextNormalizer.StopWordLoader(stopWordsFile))
     {
         Console.WriteLine("Error: stop words list load failed ...");
     }

     return docModel;
 }
Beispiel #9
0
        // ReSharper disable once UnusedParameter.Local
        /// <summary>
        /// Demo: prints trigram output for a sample sentence, raw and after
        /// normalization / stop-word removal.
        /// </summary>
        static void Main(string[] args)
        {
            const string kText =
                "this is a test. this is only a test.  if this had been an actual program it would not have been so dumb";

            var nGramGenerator = new NGrams(3);

            // Pass 1: n-grams over the raw text.
            Console.WriteLine("N-Grams from 'raw' text...");
            foreach (var nGram in nGramGenerator.GenerateNGramsStrings(kText))
            {
                Console.WriteLine(nGram.ToString());
            }

            Console.WriteLine("========================================");

            // Pass 2: normalize, then strip stop words.
            Console.WriteLine("N-Grams from 'normalize and stop words removed' text...");
            var textNormalizer   = new TextNormalizer();
            var normalizedText   = textNormalizer.NormalizeText(kText);
            var stopWordsRemoved = new StopWordsRemover().RemoveStopWords(normalizedText);

            Console.WriteLine("Normalized and Stop words removed text:");
            Console.WriteLine(stopWordsRemoved);
            Console.WriteLine("\n\n\n");

            // Show the normalized text before stop-word removal for comparison.
            var normalizedOnly = textNormalizer.NormalizeText(kText);

            Console.WriteLine("Normalized and Stop words NOT removed text:");
            Console.WriteLine(normalizedOnly);
            Console.WriteLine("\n\n\n");

            var filteredNGrams = nGramGenerator.GenerateNGrams(stopWordsRemoved);

            Console.WriteLine("N-Grams from 'normalized' text...");
            foreach (var nGram in filteredNGrams)
            {
                Console.WriteLine(nGram.ToString());
            }

            PrintEnd();
        }
Beispiel #10
0
        /// <summary>
        /// Attempts to interpret a raw database cell value as an integer.
        /// </summary>
        /// <param name="value">A boxed numeric value, a string, or <see cref="DBNull"/>.</param>
        /// <returns>
        /// The converted integer; null for <see cref="DBNull"/> or an unparseable string;
        /// 0 for any other (non-numeric, non-string) value, matching the previous default.
        /// </returns>
        private int? ParseInt(object value)
        {
            if (DBNull.Value.Equals(value))
            {
                return null;
            }

            switch (value)
            {
                case decimal d:
                    // BUG FIX: the original cast the boxed decimal straight to int
                    // ((int)value), which is an invalid unbox and throws
                    // InvalidCastException at runtime. Convert the unboxed decimal
                    // instead, consistent with the other numeric branches.
                    return Convert.ToInt32(d);

                case double _:
                case float _:
                case int _:
                    return Convert.ToInt32(value);

                case string _:
                    int parsed;
                    if (int.TryParse(TextNormalizer.ToSafeText(value), out parsed))
                    {
                        return parsed;
                    }
                    return null;

                default:
                    // Unrecognized types fall back to 0, as before.
                    return 0;
            }
        }
        /// <summary>
        /// Workout: builds a normalize → tokenize → stop-word-removal pipeline over the
        /// sentiment sample data, validates it against an invalid schema, and compares
        /// the saved output against the checked-in baseline.
        /// </summary>
        public void TextNormalizationAndStopwordRemoverWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

            // Valid input: (bool label, text) columns.
            var data = TextLoader.CreateReader(Env, ctx => (
                                                   label: ctx.LoadBool(0),
                                                   text: ctx.LoadText(1)), hasHeader: true)
                       .Read(new MultiFileSource(sentimentDataPath));

            // Invalid input: text column loaded as float, to exercise schema validation.
            var invalidData = TextLoader.CreateReader(Env, ctx => (
                                                          label: ctx.LoadBool(0),
                                                          text: ctx.LoadFloat(1)), hasHeader: true)
                              .Read(new MultiFileSource(sentimentDataPath));

            var pipeline = new TextNormalizer(Env, "text")
                           .Append(new WordTokenizer(Env, "text", "words"))
                           .Append(new StopwordRemover(Env, "words", "words_without_stopwords"));

            TestEstimatorCore(pipeline, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");

            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments {
                    Silent = true
                });

                // Save only the first 4 rows of the relevant columns for the baseline diff.
                IDataView savedData = TakeFilter.Create(Env, pipeline.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = new ChooseColumnsTransform(Env, savedData, "text", "words_without_stopwords");

                using (var fs = File.Create(outputPath))
                {
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
                }
            }

            CheckEquality("Text", "words_without_stopwords.tsv");
            Done();
        }
Beispiel #12
0
        /// <summary>
        /// Builds a <c>PatientInfo</c> from a spreadsheet row, or returns null when
        /// the first seven cells are all empty.
        /// </summary>
        /// <param name="row">The source data row (columns 0-6 are consumed).</param>
        /// <param name="rowIndex">Fallback patient number when column 0 is not parseable.</param>
        /// <returns>The parsed patient record, or null for an empty row.</returns>
        private PatientInfo ParsePatientInfo(DataRow row, int rowIndex)
        {
            // Reject rows where every relevant cell is DBNull.
            bool anyFieldFilled = false;
            for (int column = 0; column <= 6; column++)
            {
                if (!DBNull.Value.Equals(row[column]))
                {
                    anyFieldFilled = true;
                }
            }

            if (!anyFieldFilled)
            {
                return null;
            }

            // Needed twice: stored verbatim and parsed into a source type.
            var infectionContact = TextNormalizer.ToSafeText(row[4]);

            return new PatientInfo
            {
                PatientNumber = ParseInt(row[0]) ?? rowIndex,
                Gender = GenderParser.Parse(TextNormalizer.ToSafeText(row[1])),
                Domicile = TextNormalizer.ToSafeText(row[3]).ToUpper(),
                InfectionContact = infectionContact,
                InfectionSourceType = InfectionSourceParser.Parse(infectionContact),
                Age = AgeParser.Parse(TextNormalizer.ToSafeText(row[2])),
                ConfirmedOn = DateParser.Parse(TextNormalizer.ToSafeText(row[5]))
            };
        }
Beispiel #13
0
        /// <summary>
        /// Get tokens from a string.
        /// </summary>
        /// <param name="data">String.</param>
        /// <param name="options">Text parse options.</param>
        /// <returns>List of tokens.</returns>
        /// <summary>
        /// Get tokens from a string.
        /// </summary>
        /// <param name="data">String.</param>
        /// <param name="options">Text parse options.</param>
        /// <returns>List of tokens, ordered by descending occurrence count.</returns>
        /// <exception cref="ArgumentNullException">Thrown when data or options is null.</exception>
        public static List<Token> GetTokens(string data, TextParseOptions options)
        {
            if (options == null)
            {
                throw new ArgumentNullException(nameof(options));
            }
            if (data == null)
            {
                // Consistent with the options guard; previously a null string surfaced
                // as a NullReferenceException from data.Split.
                throw new ArgumentNullException(nameof(data));
            }

            Dictionary<string, Token> dict = new Dictionary<string, Token>();

            // Split eagerly; RemoveEmptyEntries already drops zero-length pieces.
            string[] rawTokens = data.Split(options.SplitCharacters, StringSplitOptions.RemoveEmptyEntries);

            for (int i = 0; i < rawTokens.Length; i++)
            {
                string token = rawTokens[i];
                if (String.IsNullOrEmpty(token))
                {
                    continue;
                }

                // Apply the configured manipulations in their original order.
                if (options.TokenManipulation.SetLowerCase)
                {
                    token = token.ToLower();
                }
                if (options.TokenManipulation.ReduceWhitespace)
                {
                    token = TextNormalizer.ReduceWhitespace(token);
                }
                if (options.TokenManipulation.RemovePunctuation)
                {
                    token = TextNormalizer.RemovePunctuation(options.PunctuationCharacters, token);
                }
                if (options.TokenManipulation.RemoveNumbers)
                {
                    token = TextNormalizer.RemoveNumbers(token);
                }
                if (options.TokenManipulation.RemoveStopWords)
                {
                    token = TextNormalizer.RemoveStopWords(options.StopWords, token);
                }

                // Filter by length bounds and the stop-word list.
                if (token.Length < options.TokenLength.Min ||
                    token.Length > options.TokenLength.Max ||
                    options.StopWords.Contains(token))
                {
                    continue;
                }

                // FIX: single TryGetValue lookup and in-place update instead of the
                // original ContainsKey + indexer + Remove + Add (four hash lookups
                // plus a throwaway Token allocation per repeated token).
                Token existing;
                if (dict.TryGetValue(token, out existing))
                {
                    existing.Count++;
                    // Preserve the original newest-first position ordering.
                    existing.Positions.Insert(0, i);
                }
                else
                {
                    Token t = new Token();
                    t.Value = token;
                    t.Count = 1;
                    t.Positions.Add(i);
                    dict.Add(token, t);
                }
            }

            // OrderByDescending on an empty source yields an empty list, so no
            // separate empty-dictionary branch is needed.
            return dict.Values.OrderByDescending(u => u.Count).ToList();
        }
Beispiel #14
0
        /// <summary>
        /// Computes CDSSM embeddings for every tweet in a TSV file and writes
        /// "original-line &lt;tab&gt; comma-separated-vector" records to the output file.
        /// Lines with fewer than 8 tab-separated columns are skipped; the document
        /// text is read from column 3.
        /// </summary>
        /// <param name="documentFile">Input TSV of tweets.</param>
        /// <param name="documentVectorFile">Output TSV receiving line + embedding.</param>
        /// <param name="queryModelFile">Query-side model path (not used by this method).</param>
        /// <param name="docModelFile">Document-side CDSSM model path.</param>
        /// <param name="stopWordsFile">Stop-word list path.</param>
        public static void CalculateTweetsCDSSM(string documentFile,
                                                string documentVectorFile,
                                                string queryModelFile,
                                                string docModelFile,
                                                string stopWordsFile)
        {
            DeepSemanticModel docModel = new DeepSemanticModel();

            if (!docModel.ModelLoader(docModelFile))
            {
                Console.WriteLine("Error: CDSSM document-side model load failed ...");
                return;
            }

            if (!TextNormalizer.StopWordLoader(stopWordsFile))
            {
                Console.WriteLine("Error: stop words list load failed ...");
                return;
            }

            int count = 0, preSize = -1;

            // FIX: the StreamWriter was never disposed on exception paths (resource
            // leak); both streams are now owned by using blocks, and the redundant
            // sr.Close() inside the reader's using is gone.
            using (StreamWriter sw = new StreamWriter(documentVectorFile))
            using (StreamReader sr = new StreamReader(documentFile))
            {
                string items;
                while (null != (items = sr.ReadLine()))
                {
                    string[] cols = items.Split('\t');
                    if (cols.Length < 8)
                    {
                        continue;
                    }

                    string doc = cols[3];
                    string normalizedDoc = TextNormalizer.Normalizer(
                        TextNormalizer.StopWordsFilter(
                            doc.Replace('-', ' ').Replace('\'', ' ').ToLowerInvariant().Trim()));

                    List<float> CDSSMFeature;
                    if (!docModel.SemanticFeatureEmbedding(normalizedDoc, out CDSSMFeature))
                    {
                        Console.WriteLine("Error: document {0} embedding failed ...", doc);
                        continue;
                    }

                    sw.WriteLine("{0}\t{1}", items, string.Join(",", CDSSMFeature));

                    // Progress marker every 100 successfully embedded documents.
                    if (count % 100 == 0)
                    {
                        Console.WriteLine("{0}", count);
                    }
                    // Warn when the embedding dimensionality changes between documents.
                    if (preSize != -1 && preSize != CDSSMFeature.Count)
                    {
                        Console.WriteLine(preSize + " " + CDSSMFeature.Count);
                    }
                    preSize = CDSSMFeature.Count;
                    count++;
                }
            }
            Console.WriteLine("CDSSM count {0}", preSize);
        }