/// <summary>
/// Loads a serialized LDA beta matrix from <paramref name="modelName"/> and rebuilds
/// the <c>vocabulary</c> map: for each topic, the top (up to 40) words ranked by
/// descending beta weight.
/// Expected file format: one line per word, "wordKey:beta0,beta1,...,betaK-1".
/// </summary>
/// <param name="modelName">Path of the model file to open for reading.</param>
/// <param name="numOfTopics">Number of topics K; each line must carry exactly K beta values.</param>
/// <param name="dictionary">Maps integer word keys back to their word strings.</param>
public void LoadLDAModel(string modelName, int numOfTopics, DocModelDictionary dictionary)
{
    List<KeyValuePair<int, double[]>> beta = new List<KeyValuePair<int, double[]>>();

    // using-blocks guarantee the file handle is released even if a parse
    // error throws (the original leaked the stream in that case).
    using (FileStream fstream = new FileStream(modelName, FileMode.Open))
    using (StreamReader reader = new StreamReader(fstream))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            string[] ss = line.Split(':');
            int wordKey = int.Parse(ss[0], System.Globalization.CultureInfo.InvariantCulture);
            string[] sss = ss[1].Split(',');
            Debug.Assert(sss.Length == numOfTopics);
            double[] betaWord = new double[numOfTopics];
            for (int i = 0; i < sss.Length; i++)
            {
                // Invariant culture: the model file is machine-written, so the
                // decimal separator must not depend on the OS locale.
                betaWord[i] = double.Parse(sss[i], System.Globalization.CultureInfo.InvariantCulture);
            }
            beta.Add(new KeyValuePair<int, double[]>(wordKey, betaWord));
        }
    }

    if (vocabulary == null)
    {
        vocabulary = new Dictionary<int, string[]>();
    }
    else
    {
        vocabulary.Clear();
    }

    for (int i = 0; i < numOfTopics; i++)
    {
        int topic = i; // local copy so the comparer does not capture the loop variable
        // Descending sort by this topic's beta weight.
        beta.Sort((x1, x2) => x2.Value[topic].CompareTo(x1.Value[topic]));

        // Guard against vocabularies smaller than 40 words: the original
        // indexed beta[j] unconditionally and threw on short vocabularies.
        int topWords = Math.Min(40, beta.Count);
        string[] wordList = new string[topWords];
        for (int j = 0; j < topWords; j++)
        {
            wordList[j] = dictionary.GetKey(beta[j].Key);
        }
        vocabulary.Add(i, wordList);
    }
}
/// <summary>
/// Renders a precedence pattern as a human-readable string of the form
/// "{ [word1], [word2] }", resolving each word key through the dictionary.
/// Returns "{ }" for an empty pattern.
/// </summary>
/// <param name="p">Pattern whose word-key array <c>p.p</c> is rendered.</param>
/// <param name="dictionary">Maps word keys back to their word strings.</param>
/// <returns>The formatted pattern string.</returns>
public static string ConvertWords(PrecedenceProperty p, DocModelDictionary dictionary)
{
    StringBuilder sb = new StringBuilder();
    sb.Append("{");
    for (int i = 0; i < p.p.Length; i++)
    {
        int wordKey = p.p[i];
        sb.Append(" [");
        sb.Append(dictionary.GetKey(wordKey));
        sb.Append("],");
    }
    // Only strip the trailing comma when at least one word was appended;
    // the original unconditionally removed the last char and so deleted the
    // opening brace for an empty pattern, yielding " }".
    if (p.p.Length > 0)
    {
        sb.Remove(sb.Length - 1, 1);
    }
    sb.Append(" }");
    return sb.ToString();
}
/// <summary>
/// Populates this model from a MongoDB document: reads DocID, the optional
/// ClassLabels array, and the optional WordCounts array of single-entry
/// { wordKey : weight } sub-documents.
/// </summary>
/// <param name="doc">BSON document holding the persisted model.</param>
/// <param name="wordDict">Word dictionary; accepted for the call signature but not read here.</param>
/// <returns>This instance on success, or null when no word weights were loaded.</returns>
public LDAModel LoadFromDB(BsonDocument doc, DocModelDictionary wordDict)
{
    DocID = doc["DocID"].AsString;
    // Reset any previously loaded state before re-populating.
    if (classLabels != null) { classLabels.Clear(); }
    if (!doc["ClassLabels"].IsBsonNull)
    {
        foreach (BsonValue classLabel in doc["ClassLabels"].AsBsonArray)
        {
            AddClassLabel(classLabel.AsInt32);
        }
    }
    if (wordWeights != null) { wordWeights.Clear(); }
    if (!doc["WordCounts"].IsBsonNull)
    {
        foreach (BsonDocument kvp in doc["WordCounts"].AsBsonArray)
        {
            // Each sub-document is expected to hold exactly ONE element
            // (word key as the name, weight as the value). The f flag trips
            // the assert if a second element is ever encountered.
            bool f = true;
            foreach (BsonElement e in kvp)
            {
                int wordKey = int.Parse(e.Name);
                AddWord(wordKey, e.Value.AsDouble);
                if (f) { f = false; } else { Debug.Assert(false); }
            }
        }
    }
    // wordWeights still null means WordCounts was null/empty and AddWord
    // never ran — signal "nothing loaded" to the caller.
    if (wordWeights == null) { return null; }
    return this;
}
/// <summary>
/// Creates a precedence model bound to the given dictionaries.
/// </summary>
/// <param name="dictionary">Word dictionary used by this model.</param>
/// <param name="clsDictionary">Class-label dictionary used by this model.</param>
public PrecedenceModel(DocModelDictionary dictionary, DocModelDictionary clsDictionary)
{
    // Plain field wiring; the assignments are independent of each other.
    this.clsDictionary = clsDictionary;
    this.dictionary = dictionary;
}
/// <summary>
/// Creates a query object over an existing precedence model.
/// </summary>
/// <param name="pModel">The precedence model to query.</param>
/// <param name="wordDictionary">Dictionary resolving word keys.</param>
/// <param name="classLabelDictionary">Dictionary resolving class-label keys.</param>
public PrecedenceQuery(PModel.PrecedenceModel pModel, DocModelDictionary wordDictionary, DocModelDictionary classLabelDictionary)
{
    // Plain field wiring; order of assignment is immaterial.
    this.classLabelDictionary = classLabelDictionary;
    this.wordDictionary = wordDictionary;
    this.pModel = pModel;
}
/// <summary>
/// Statistics reporting is not supported by this model database type.
/// </summary>
/// <param name="classLabelDict">Unused.</param>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public override void Stats(DocModelDictionary classLabelDict) { throw new NotImplementedException(); }
/// <summary>
/// Creates an LDA bag-of-words model database for the given topic count,
/// delegating common setup to the base database class.
/// </summary>
/// <param name="numOfTopics">Number of LDA topics this database models.</param>
/// <param name="wd">Word dictionary shared with the base class.</param>
public LDABoWModelDB(int numOfTopics, DocModelDictionary wd)
    : base(wd)
{
    NumOfTopics = numOfTopics;
    // Fixed symmetric prior; presumably the Dirichlet alpha — TODO confirm.
    alpha = 0.2;
    //Init();
}
/// <summary>
/// Computes corpus statistics over every document in docDB and writes four
/// kinds of report files: per-class-label document counts ("classlabel_stats"),
/// per-word occurrence counts ("word_stats"), candidate document ids
/// ("doc_stats"), and — for each class label carried by more than 900
/// documents — a 70/10/20 training / cross-validation / testing split written
/// under training_data\, crsvalid_data\ and testing_data\.
/// </summary>
/// <param name="classLabelDict">Maps class-label keys back to label strings for the report.</param>
public override void Stats(DocModelDictionary classLabelDict)
{
    if (docDB.Count == 0) return;
    // class-label key -> set of distinct doc ids carrying that label
    Dictionary<int, HashSet<string>> classLabelCounts = new Dictionary<int, HashSet<string>>();
    // word key -> summed occurrence count across the corpus
    Dictionary<int, int> wordCounts = new Dictionary<int, int>();
    for (int i = 0; i < docDB.Count; i++)
    {
        BoWModel doc = ((BoWModel)docDB[i]);
        if (doc.ClassLabels != null)
        {
            foreach (int k in doc.ClassLabels)
            {
                //string key = classLabelDict.GetKey(k);
                HashSet<string> docIds;
                if (!classLabelCounts.TryGetValue(k, out docIds))
                {
                    docIds = new HashSet<string>();
                    classLabelCounts.Add(k, docIds);
                }
                docIds.Add(doc.DocID);
            }
        }
        // Accumulate word counts; doc.Word(n)/doc.Count(n) are the n-th
        // (key, count) pair of the bag-of-words.
        for (int n = 0; n < doc.Length; n++)
        {
            int count = 0;
            //string key = wordDict.GetKey(doc.Word(n));
            if (wordCounts.TryGetValue(doc.Word(n), out count))
            {
                count += doc.Count(n);
                wordCounts[doc.Word(n)] = count;
            }
            else
            {
                wordCounts.Add(doc.Word(n), doc.Count(n));
            }
        }
    }
    // Report class labels ordered by descending document count.
    List<KeyValuePair<int, HashSet<string>>> orderedCLCounts = classLabelCounts.ToList();
    orderedCLCounts.Sort(
        (x1, x2) =>
        {
            if (x1.Value.Count > x2.Value.Count) return -1;
            else if (x1.Value.Count == x2.Value.Count) return 0;
            else return 1;
        }
    );
    StreamWriter writer = new StreamWriter(new FileStream("classlabel_stats", FileMode.Create));
    for (int i = 0; i < orderedCLCounts.Count; i++)
    {
        writer.WriteLine("{0} : {1}", classLabelDict.GetKey(orderedCLCounts[i].Key), orderedCLCounts[i].Value.Count);
    }
    writer.Close();
    // Report words ordered by descending occurrence count.
    List<KeyValuePair<int, int>> orderedWordCounts = wordCounts.ToList();
    orderedWordCounts.Sort(
        (x1, x2) =>
        {
            if (x1.Value > x2.Value) return -1;
            else if (x1.Value == x2.Value) return 0;
            else return 1;
        }
    );
    writer = new StreamWriter(new FileStream("word_stats", FileMode.Create));
    for (int i = 0; i < orderedWordCounts.Count; i++)
    {
        writer.WriteLine("{0} : {1}", wordDict.GetKey(orderedWordCounts[i].Key), orderedWordCounts[i].Value);
    }
    writer.Close();
    orderedWordCounts.Clear();
    wordCounts.Clear();
    // Collect ids of documents whose FIRST class label is "frequent"
    // (carried by more than 900 distinct documents).
    List<string> candiates = new List<string>(); // (sic) original spelling preserved
    for (int i = 0; i < docDB.Count; i++)
    {
        BoWModel doc = ((BoWModel)docDB[i]);
        if (doc.ClassLabels != null)
        {
            foreach (int k in doc.ClassLabels)
            {
                if (classLabelCounts[k].Count > 900)
                {
                    candiates.Add(doc.DocID);
                }
                // NOTE(review): this break means only the first class label of
                // each document is examined — confirm that is intended.
                break;
            }
        }
    }
    writer = new StreamWriter(new FileStream("doc_stats", FileMode.Create));
    for (int i = 0; i < candiates.Count; i++)
    {
        writer.WriteLine("{0}", candiates[i]);
    }
    writer.Close();
    // For each frequent class label, split its documents 70/10/20 into
    // training / cross-validation / testing id sets, then top each set up
    // from the candidate pool and write the three id lists to disk.
    foreach (KeyValuePair<int, HashSet<string>> kvp in classLabelCounts)
    {
        if (kvp.Value.Count > 900)
        {
            HashSet<string> training = new HashSet<string>();
            HashSet<string> crsvalid = new HashSet<string>();
            HashSet<string> testing = new HashSet<string>();
            int trainingSize = (int)(kvp.Value.Count * 0.7);
            int crsvalidSize = (int)(kvp.Value.Count * 0.1);
            int testingSize = kvp.Value.Count - trainingSize - crsvalidSize;
            int i = 0;
            // Deterministic partition in HashSet enumeration order.
            foreach (string s in kvp.Value)
            {
                if (i < trainingSize)
                {
                    training.Add(s);
                }
                else if (i < trainingSize + crsvalidSize)
                {
                    crsvalid.Add(s);
                }
                else
                {
                    testing.Add(s);
                }
                i++;
            }
            // Presumably RandomlyFill pads each set up to the given target size
            // with candidate docs not yet in usedDocs — TODO confirm against
            // the RandomlyFill implementation. trainingSize/0.7 restores the
            // label's full document count as the training target.
            HashSet<string> usedDocs = new HashSet<string>();
            RandomlyFill(training, kvp.Value, usedDocs, candiates, (int)(trainingSize/0.7));
            RandomlyFill(crsvalid, kvp.Value, usedDocs, candiates, 1000);
            RandomlyFill(testing, kvp.Value, usedDocs, candiates, 2000);
            writer = new StreamWriter(new FileStream("training_data\\doc_training_stats_" + kvp.Key, FileMode.Create));
            foreach(string s in training)
            {
                writer.WriteLine("{0}", s);
            }
            writer.Close();
            writer = new StreamWriter(new FileStream("crsvalid_data\\doc_crsvalid_stats_" + kvp.Key, FileMode.Create));
            foreach (string s in crsvalid)
            {
                writer.WriteLine("{0}", s);
            }
            writer.Close();
            writer = new StreamWriter(new FileStream("testing_data\\doc_testing_stats_" + kvp.Key, FileMode.Create));
            foreach (string s in testing)
            {
                writer.WriteLine("{0}", s);
            }
            writer.Close();
        }
    }
}
/// <summary>
/// Computes and reports statistics for this document database; each concrete
/// database type decides what is collected and where it is written.
/// </summary>
/// <param name="classLabelDict">Maps class-label keys back to label strings.</param>
public abstract void Stats(DocModelDictionary classLabelDict);
/// <summary>
/// Connects to the default MongoDB server and binds the database and
/// collection this document DB persists to, then initializes the in-memory
/// document list.
/// </summary>
/// <param name="wd">Word dictionary shared by the documents in this DB.</param>
public DocModelDB(DocModelDictionary wd)
{
    // Parameterless Create() targets the default local server
    // (legacy 1.x C# driver API).
    server = MongoServer.Create();
    db = server.GetDatabase(DBName);
    coll = db.GetCollection<BsonDocument>(CollectionName);
    wordDict = wd;
    docDB = new List<DocModel>();
}
/// <summary>
/// Creates a bag-of-words model database over the given word dictionary;
/// all setup is delegated to the base class.
/// </summary>
/// <param name="wd">Word dictionary passed through to the base constructor.</param>
public BoWModelDB(DocModelDictionary wd) : base(wd) { }