Пример #1
0
        /// <summary>
        /// Reads a per-word topic weight table from <paramref name="modelName"/> and rebuilds
        /// <c>vocabulary</c> so that each topic index maps to its highest-weighted words.
        /// </summary>
        /// <remarks>
        /// Each non-blank line is expected to look like <c>wordKey:w0,w1,...</c>, i.e. an integer
        /// word key, a colon, and one comma-separated weight per topic. Because the comma is the
        /// value separator, numbers are parsed with the invariant culture so the result does not
        /// depend on the machine's regional settings.
        /// </remarks>
        /// <param name="modelName">Path of the model file to open.</param>
        /// <param name="numOfTopics">Number of weights expected on every line.</param>
        /// <param name="dictionary">Dictionary used to translate word keys into words via <c>GetKey</c>.</param>
        public void LoadLDAModel(string modelName, int numOfTopics, DocModelDictionary dictionary)
        {
            // Upper bound on how many words are kept per topic.
            const int TopWordsPerTopic = 40;
            System.Globalization.CultureInfo invariant = System.Globalization.CultureInfo.InvariantCulture;

            List<KeyValuePair<int, double[]>> beta = new List<KeyValuePair<int, double[]>>();

            // using-blocks guarantee the file handle is released even when a line fails to parse.
            using (FileStream fstream = new FileStream(modelName, FileMode.Open))
            using (StreamReader reader = new StreamReader(fstream))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    // Tolerate blank lines (e.g. a trailing newline at the end of the file).
                    if (line.Trim().Length == 0)
                    {
                        continue;
                    }

                    string[] ss = line.Split(':');
                    int wordKey = int.Parse(ss[0], invariant);
                    double[] betaWord = new double[numOfTopics];
                    string[] sss = ss[1].Split(',');
                    Debug.Assert(sss.Length == numOfTopics);
                    for (int i = 0; i < sss.Length; i++)
                    {
                        betaWord[i] = double.Parse(sss[i], invariant);
                    }
                    beta.Add(new KeyValuePair<int, double[]>(wordKey, betaWord));
                }
            }

            if (vocabulary == null)
            {
                vocabulary = new Dictionary<int, string[]>();
            }
            else
            {
                vocabulary.Clear();
            }

            // Never read past the end of the table when the model has fewer words than the cap.
            int topCount = beta.Count < TopWordsPerTopic ? beta.Count : TopWordsPerTopic;

            for (int i = 0; i < numOfTopics; i++)
            {
                int topic = i;

                // Descending by this topic's weight; CompareTo yields a consistent ordering
                // even if a weight is NaN.
                beta.Sort((x1, x2) => x2.Value[topic].CompareTo(x1.Value[topic]));

                string[] wordList = new string[topCount];
                for (int j = 0; j < topCount; j++)
                {
                    wordList[j] = dictionary.GetKey(beta[j].Key);
                }
                vocabulary.Add(i, wordList);
            }
        }
Пример #2
0
 /// <summary>
 /// Renders the word keys held in <c>p.p</c> as a brace-delimited list of bracketed words,
 /// for example <c>{ [first], [second] }</c>.
 /// </summary>
 /// <param name="p">Property whose <c>p</c> array supplies the word keys, in order.</param>
 /// <param name="dictionary">Dictionary used to translate each word key via <c>GetKey</c>.</param>
 /// <returns>The formatted list; <c>{ }</c> when <c>p.p</c> is empty.</returns>
 public static string ConvertWords(PrecedenceProperty p, DocModelDictionary dictionary)
 {
     StringBuilder sb = new StringBuilder();
     sb.Append("{");
     for (int i = 0; i < p.p.Length; i++)
     {
         // Emit the separator before every item except the first, so there is never a
         // trailing comma to strip (stripping used to eat the opening brace on empty input).
         if (i > 0)
         {
             sb.Append(",");
         }
         sb.Append(" [");
         sb.Append(dictionary.GetKey(p.p[i]));
         sb.Append("]");
     }
     sb.Append(" }");
     return sb.ToString();
 }
Пример #3
0
        /// <summary>
        /// Repopulates this instance from a stored BSON document: the document id, the class
        /// labels (when the "ClassLabels" value is not BSON null) and the word weights (when the
        /// "WordCounts" value is not BSON null).
        /// </summary>
        /// <param name="doc">Document providing the "DocID", "ClassLabels" and "WordCounts" values.</param>
        /// <param name="wordDict">Not used by this method.</param>
        /// <returns>
        /// This instance, or <c>null</c> when <c>wordWeights</c> is still null after loading.
        /// </returns>
        public LDAModel LoadFromDB(BsonDocument doc, DocModelDictionary wordDict)
        {
            DocID = doc["DocID"].AsString;

            // Drop any previously loaded labels before reading the stored ones.
            if (classLabels != null)
            {
                classLabels.Clear();
            }
            BsonValue storedLabels = doc["ClassLabels"];
            if (!storedLabels.IsBsonNull)
            {
                foreach (BsonValue label in storedLabels.AsBsonArray)
                {
                    AddClassLabel(label.AsInt32);
                }
            }

            // Same reset-then-fill sequence for the word weights.
            if (wordWeights != null)
            {
                wordWeights.Clear();
            }
            BsonValue storedCounts = doc["WordCounts"];
            if (!storedCounts.IsBsonNull)
            {
                foreach (BsonDocument entry in storedCounts.AsBsonArray)
                {
                    int elementsSeen = 0;
                    foreach (BsonElement element in entry)
                    {
                        // The element name is the word key, its value the weight.
                        AddWord(int.Parse(element.Name), element.Value.AsDouble);

                        // Debug builds flag any entry carrying more than one element.
                        Debug.Assert(elementsSeen == 0);
                        elementsSeen++;
                    }
                }
            }

            return wordWeights == null ? null : this;
        }
Пример #4
0
 /// <summary>
 /// Creates a precedence model that keeps references to the two supplied dictionaries.
 /// </summary>
 /// <param name="dictionary">Stored in the <c>dictionary</c> member.</param>
 /// <param name="clsDictionary">Stored in the <c>clsDictionary</c> member.</param>
 public PrecedenceModel(DocModelDictionary dictionary, DocModelDictionary clsDictionary)
 {
     // Both arguments are stored as-is; no copies are made.
     this.clsDictionary = clsDictionary;
     this.dictionary = dictionary;
 }
Пример #5
0
 /// <summary>
 /// Creates a query object that keeps references to the given model and dictionaries.
 /// </summary>
 /// <param name="pModel">Stored in the <c>pModel</c> member.</param>
 /// <param name="wordDictionary">Stored in the <c>wordDictionary</c> member.</param>
 /// <param name="classLabelDictionary">Stored in the <c>classLabelDictionary</c> member.</param>
 public PrecedenceQuery(PModel.PrecedenceModel pModel, DocModelDictionary wordDictionary, DocModelDictionary classLabelDictionary)
 {
     // Dictionaries first, then the model; all three are stored as-is.
     this.classLabelDictionary = classLabelDictionary;
     this.wordDictionary = wordDictionary;
     this.pModel = pModel;
 }
Пример #6
0
 /// <summary>
 /// Statistics are not implemented for this type; calling this method always throws.
 /// </summary>
 /// <param name="classLabelDict">Not used.</param>
 /// <exception cref="NotImplementedException">Always thrown.</exception>
 public override void Stats(DocModelDictionary classLabelDict)
 {
     throw new NotImplementedException();
 }
Пример #7
0
 /// <summary>
 /// Creates the database wrapper, forwarding the word dictionary to the base constructor,
 /// then sets <c>alpha</c> to 0.2 and records the requested number of topics.
 /// </summary>
 /// <param name="numOfTopics">Value assigned to <c>NumOfTopics</c>.</param>
 /// <param name="wd">Word dictionary passed to the base constructor.</param>
 public LDABoWModelDB(int numOfTopics, DocModelDictionary wd)
     : base(wd)
 {
     this.alpha = 0.2;
     this.NumOfTopics = numOfTopics;
     // NOTE: the Init() call is disabled here.
 }
Пример #8
0
        /// <summary>
        /// Computes corpus statistics over <c>docDB</c> and writes them to files in the current
        /// working directory.
        /// </summary>
        /// <remarks>
        /// Output files:
        /// <list type="bullet">
        /// <item><description><c>classlabel_stats</c> - one line per class label with the number of distinct document ids carrying it, most frequent first.</description></item>
        /// <item><description><c>word_stats</c> - one line per word with its summed count, most frequent first.</description></item>
        /// <item><description><c>doc_stats</c> - ids of documents whose first class label occurs in more than 900 documents.</description></item>
        /// <item><description><c>training_data\</c>, <c>crsvalid_data\</c>, <c>testing_data\</c> - for every class label with more than 900 documents, a training / cross-validation / testing list of document ids.</description></item>
        /// </list>
        /// Does nothing when <c>docDB</c> is empty. Output directories are created on demand.
        /// </remarks>
        /// <param name="classLabelDict">Dictionary used to translate class label keys via <c>GetKey</c>.</param>
        public override void Stats(DocModelDictionary classLabelDict)
        {
            // A class label takes part in the split only when it has more documents than this.
            const int MinDocsPerClass = 900;

            if (docDB.Count == 0)
            {
                return;
            }

            Dictionary<int, HashSet<string>> classLabelCounts = new Dictionary<int, HashSet<string>>();
            Dictionary<int, int> wordCounts = new Dictionary<int, int>();
            for (int i = 0; i < docDB.Count; i++)
            {
                BoWModel doc = (BoWModel)docDB[i];
                if (doc.ClassLabels != null)
                {
                    foreach (int k in doc.ClassLabels)
                    {
                        HashSet<string> docIds;
                        if (!classLabelCounts.TryGetValue(k, out docIds))
                        {
                            docIds = new HashSet<string>();
                            classLabelCounts.Add(k, docIds);
                        }
                        docIds.Add(doc.DocID);
                    }
                }
                for (int n = 0; n < doc.Length; n++)
                {
                    int word = doc.Word(n);
                    int count;
                    // TryGetValue leaves count at 0 when the word has not been seen yet.
                    wordCounts.TryGetValue(word, out count);
                    wordCounts[word] = count + doc.Count(n);
                }
            }

            // Class labels, most documents first.
            List<KeyValuePair<int, HashSet<string>>> orderedCLCounts = classLabelCounts.ToList();
            orderedCLCounts.Sort((x1, x2) => x2.Value.Count.CompareTo(x1.Value.Count));
            using (StreamWriter writer = OpenStatsWriter("classlabel_stats"))
            {
                for (int i = 0; i < orderedCLCounts.Count; i++)
                {
                    writer.WriteLine("{0} : {1}", classLabelDict.GetKey(orderedCLCounts[i].Key), orderedCLCounts[i].Value.Count);
                }
            }

            // Words, highest total count first.
            List<KeyValuePair<int, int>> orderedWordCounts = wordCounts.ToList();
            orderedWordCounts.Sort((x1, x2) => x2.Value.CompareTo(x1.Value));
            using (StreamWriter writer = OpenStatsWriter("word_stats"))
            {
                for (int i = 0; i < orderedWordCounts.Count; i++)
                {
                    writer.WriteLine("{0} : {1}", wordDict.GetKey(orderedWordCounts[i].Key), orderedWordCounts[i].Value);
                }
            }
            orderedWordCounts.Clear();
            wordCounts.Clear();

            List<string> candidates = new List<string>();
            for (int i = 0; i < docDB.Count; i++)
            {
                BoWModel doc = (BoWModel)docDB[i];
                if (doc.ClassLabels != null)
                {
                    foreach (int k in doc.ClassLabels)
                    {
                        if (classLabelCounts[k].Count > MinDocsPerClass)
                        {
                            candidates.Add(doc.DocID);
                        }
                        // NOTE(review): only the first class label of each document is examined;
                        // confirm this is intended (it does keep each id from being added twice).
                        break;
                    }
                }
            }
            WriteStatsLines("doc_stats", candidates);

            foreach (KeyValuePair<int, HashSet<string>> kvp in classLabelCounts)
            {
                if (kvp.Value.Count <= MinDocsPerClass)
                {
                    continue;
                }

                HashSet<string> training = new HashSet<string>();
                HashSet<string> crsvalid = new HashSet<string>();
                HashSet<string> testing = new HashSet<string>();

                // First 70% of the ids go to training, the next 10% to cross-validation,
                // and whatever remains to testing.
                int trainingSize = (int)(kvp.Value.Count * 0.7);
                int crsvalidSize = (int)(kvp.Value.Count * 0.1);
                int position = 0;
                foreach (string s in kvp.Value)
                {
                    if (position < trainingSize)
                    {
                        training.Add(s);
                    }
                    else if (position < trainingSize + crsvalidSize)
                    {
                        crsvalid.Add(s);
                    }
                    else
                    {
                        testing.Add(s);
                    }
                    position++;
                }

                // RandomlyFill is defined elsewhere; presumably it tops each set up from the
                // candidate pool towards the given size - verify against its definition.
                HashSet<string> usedDocs = new HashSet<string>();
                RandomlyFill(training, kvp.Value, usedDocs, candidates, (int)(trainingSize/0.7));
                RandomlyFill(crsvalid, kvp.Value, usedDocs, candidates, 1000);
                RandomlyFill(testing, kvp.Value, usedDocs, candidates, 2000);

                WriteStatsLines("training_data\\doc_training_stats_" + kvp.Key, training);
                WriteStatsLines("crsvalid_data\\doc_crsvalid_stats_" + kvp.Key, crsvalid);
                WriteStatsLines("testing_data\\doc_testing_stats_" + kvp.Key, testing);
            }
        }

        // Opens (creating or truncating) the file at path for writing, first creating its
        // parent directory when the path names one. The caller disposes the returned writer.
        private static StreamWriter OpenStatsWriter(string path)
        {
            string directory = Path.GetDirectoryName(path);
            if (!string.IsNullOrEmpty(directory))
            {
                Directory.CreateDirectory(directory);
            }
            return new StreamWriter(new FileStream(path, FileMode.Create));
        }

        // Writes every entry of lines to the file at path, one entry per line.
        private static void WriteStatsLines(string path, IEnumerable<string> lines)
        {
            using (StreamWriter writer = OpenStatsWriter(path))
            {
                foreach (string s in lines)
                {
                    writer.WriteLine(s);
                }
            }
        }
Пример #9
0
 /// <summary>
 /// Produces statistics for this document database; the behavior is defined by each
 /// overriding implementation.
 /// </summary>
 /// <param name="classLabelDict">Class label dictionary made available to the implementation.</param>
 public abstract void Stats(DocModelDictionary classLabelDict);
Пример #10
0
 /// <summary>
 /// Creates the document database wrapper: stores the word dictionary, starts with an empty
 /// in-memory document list, and resolves the MongoDB server, database and collection handles.
 /// </summary>
 /// <param name="wd">Word dictionary stored in <c>wordDict</c>.</param>
 public DocModelDB(DocModelDictionary wd)
 {
     // Plain in-memory state.
     wordDict = wd;
     docDB = new List<DocModel>();

     // MongoDB handles: each one is obtained from the previous.
     server = MongoServer.Create();
     db = server.GetDatabase(DBName);
     coll = db.GetCollection<BsonDocument>(CollectionName);
 }
Пример #11
0
 /// <summary>
 /// Creates the database wrapper; all initialization is delegated to the base constructor.
 /// </summary>
 /// <param name="wd">Word dictionary forwarded to the base constructor.</param>
 public BoWModelDB(DocModelDictionary wd)
     : base(wd)
 {
 }