Esempio n. 1
0
        /// <summary>
        /// Computes the CDSSM feature vector for a single document string.
        /// </summary>
        /// <param name="doc">Raw document text. Hyphens and apostrophes are replaced by spaces and the
        /// text is lower-cased, trimmed, stop-word filtered and normalized before embedding.</param>
        /// <param name="docModelFile">Path of the document-side model file, passed to <c>GetDocModel</c>.</param>
        /// <param name="stopWordsFile">Path of the stop words file, passed to <c>GetDocModel</c>.</param>
        /// <returns>The embedding produced by the model, or <c>null</c> when the model is unavailable
        /// or the embedding fails.</returns>
        public static List <float> GetCDSSM(string doc, string docModelFile, string stopWordsFile)
        {
            DeepSemanticModel dm = GetDocModel(docModelFile, stopWordsFile);

            if (dm == null)
            {
                // GetDocModel returns null (after logging) when the model file cannot be loaded.
                // Report the failure the same way as an embedding failure instead of dereferencing null.
                return(null);
            }

            string normalizedDoc = TextNormalizer.Normalizer(TextNormalizer.StopWordsFilter(doc.Replace('-', ' ').Replace('\'', ' ').ToLowerInvariant().Trim()));

            // The out argument is always assigned by the callee, so no list is pre-allocated here.
            List <float> CDSSMFeature;
            if (!dm.SemanticFeatureEmbedding(normalizedDoc, out CDSSMFeature))
            {
                Console.WriteLine("Error: document {0} embedding failed ...", doc);
                return(null);
            }
            return(CDSSMFeature);
        }
Esempio n. 2
0
 /// <summary>
 /// Returns the shared document-side model held in the <c>docModel</c> field, loading it on first use.
 /// </summary>
 /// <param name="docModelFile">Path of the model file; only read when no model is cached yet.</param>
 /// <param name="stopWordsFile">Path of the stop words file; only read when no model is cached yet.</param>
 /// <returns>The cached model, or <c>null</c> when the model file fails to load.</returns>
 private static DeepSemanticModel GetDocModel(string docModelFile, string stopWordsFile)
 {
     if (docModel != null)
     {
         return(docModel);
     }

     // Load into a local first so the shared field never refers to a model whose load
     // did not complete (for example when the loader throws part-way through).
     DeepSemanticModel candidate = new DeepSemanticModel();
     if (!candidate.ModelLoader(docModelFile))
     {
         Console.WriteLine("Error: CDSSM document-side model load failed ...");
         return(null);
     }

     // A stop words load failure is reported but does not prevent the model from being used.
     if (!TextNormalizer.StopWordLoader(stopWordsFile))
     {
         Console.WriteLine("Error: stop words list load failed ...");
     }

     docModel = candidate;
     return(docModel);
 }
Esempio n. 3
0
        /// <summary>
        /// Reads a tab-separated document file line by line, embeds the text in the fourth column
        /// with the document-side CDSSM model, and writes each successfully embedded line to the
        /// output file followed by a tab and the comma-separated feature vector.
        /// </summary>
        /// <param name="documentFile">Input file; lines with fewer than 8 tab-separated columns are skipped.</param>
        /// <param name="documentVectorFile">Output file; receives the original line plus its feature vector.</param>
        /// <param name="queryModelFile">Not used by this method; kept so the signature stays unchanged.</param>
        /// <param name="docModelFile">Path of the document-side model file.</param>
        /// <param name="stopWordsFile">Path of the stop words file.</param>
        public static void CalculateTweetsCDSSM(string documentFile,
                                                string documentVectorFile,
                                                string queryModelFile,
                                                string docModelFile,
                                                string stopWordsFile)
        {
            // A dedicated model instance is used for the batch run.
            DeepSemanticModel model = new DeepSemanticModel();

            if (!model.ModelLoader(docModelFile))
            {
                Console.WriteLine("Error: CDSSM document-side model load failed ...");
                return;
            }

            if (!TextNormalizer.StopWordLoader(stopWordsFile))
            {
                Console.WriteLine("Error: stop words list load failed ...");
                return;
            }

            int count = 0, preSize = -1;

            // Both streams are scoped by using so they are flushed and closed even when
            // reading or embedding throws part-way through the file.
            using (StreamWriter sw = new StreamWriter(documentVectorFile))
            using (StreamReader sr = new StreamReader(documentFile))
            {
                string items;
                while (null != (items = sr.ReadLine()))
                {
                    string[] cols = items.Split('\t');
                    if (cols.Length < 8)
                    {
                        continue;
                    }

                    string doc           = cols[3];
                    string normalizedDoc = TextNormalizer.Normalizer(TextNormalizer.StopWordsFilter(doc.Replace('-', ' ').Replace('\'', ' ').ToLowerInvariant().Trim()));

                    List <float> CDSSMFeature;
                    if (!model.SemanticFeatureEmbedding(normalizedDoc, out CDSSMFeature))
                    {
                        Console.WriteLine("Error: document {0} embedding failed ...", doc);
                        continue;
                    }

                    sw.WriteLine("{0}\t{1}", items, string.Join(",", CDSSMFeature));

                    // Progress report: count only advances for successfully embedded lines.
                    if (count % 100 == 0)
                    {
                        Console.WriteLine("{0}", count);
                    }

                    // Report when the vector length differs from the previous document's.
                    if (preSize != -1 && preSize != CDSSMFeature.Count)
                    {
                        Console.WriteLine(preSize + " " + CDSSMFeature.Count);
                    }
                    preSize = CDSSMFeature.Count;
                    count++;
                }
            }

            // Prints the length of the last feature vector written (-1 when nothing was embedded).
            Console.WriteLine("CDSSM count {0}", preSize);
        }