private void button4_Click(object sender, EventArgs e)
{
    number = int.Parse(textBox1.Text);
    wb = new WordBag[number];

    for (int a = 0; a < number; a++)
    {
        if (openFileDialog1.ShowDialog() != DialogResult.OK)
        {
            return; // a cancelled dialog would leave wb[a] null and break the TF-IDF pass below
        }

        wb[a] = new WordBag();
        Encoding encode = Encoding.GetEncoding("ks_c_5601-1987"); // Korean EUC-KR code page

        // Read the whole file, joining lines with a space so words on adjacent
        // lines are not glued together. (The original also opened an unused,
        // undisposed FileStream on the same file; it is dropped here.)
        var wholeWords = new StringBuilder();
        using (var fs = new StreamReader(openFileDialog1.FileName, encode))
        {
            string str;
            while ((str = fs.ReadLine()) != null)
            {
                wholeWords.Append(str).Append(' ');
            }
        }

        string[] bags = wholeWords.ToString().Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
        wb[a].getBag(bags, 15);
        wb[a].SortingBag();
    }

    // Weight each bag against all of the other documents (TF-IDF).
    for (int a = 0; a < number; a++)
    {
        var temp = new WordBag[number - 1];
        int cnt = 0;
        for (int b = 0; b < number; b++)
        {
            if (a != b)
            {
                temp[cnt++] = wb[b];
            }
        }
        wb[a].get_TD_IDF(temp);
    }
}
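// The handler above relies on a WordBag with getBag/SortingBag/get_TD_IDF whose source
// is not shown. What follows is a minimal sketch, assuming getBag keeps the topN most
// frequent tokens, SortingBag orders them by weight, and get_TD_IDF reweights each
// count by TF-IDF against the other documents. The class name and field are
// hypothetical; only the method names mirror the calls above.
using System;
using System.Collections.Generic;
using System.Linq;

public class WordBagSketch
{
    // term -> weight (raw count until get_TD_IDF runs)
    private Dictionary<string, double> weights = new Dictionary<string, double>();

    public void getBag(string[] tokens, int topN)
    {
        weights = tokens
            .Where(t => t.Length > 0)
            .GroupBy(t => t)
            .OrderByDescending(g => g.Count())
            .Take(topN)
            .ToDictionary(g => g.Key, g => (double)g.Count());
    }

    public void SortingBag()
    {
        weights = weights
            .OrderByDescending(p => p.Value)
            .ToDictionary(p => p.Key, p => p.Value);
    }

    // tf-idf: count * log(N / (1 + number of other documents containing the term))
    public void get_TD_IDF(WordBagSketch[] others)
    {
        int n = others.Length + 1; // total number of documents
        weights = weights.ToDictionary(
            p => p.Key,
            p => p.Value * Math.Log((double)n / (1 + others.Count(o => o.weights.ContainsKey(p.Key)))));
    }
}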
public void TestWords(Log log)
{
    var net = new SNeuralNet(6, 2, 300, 1);
    var wordBag = WordBag.CreateToWords(string.Join(". ", Configuration.RawDataList), 1);
    var trainSets = new List<Tuple<double[], double[]>>();
    var wordsHistory = new List<string>();
    var vocab = new LRVocab().Create(Configuration.VocabularyPath, (string s) => log(s));

    log("Prepare tests list");
    foreach (var word in wordBag.Read())
    {
        var w = word[0];
        if (!vocab.Vocabulary.ContainsWord(w))
        {
            continue;
        }

        // Keep a sliding window of the last 4 in-vocabulary words.
        wordsHistory.Add(w);
        if (wordsHistory.Count < 4)
        {
            continue;
        }
        if (wordsHistory.Count > 4)
        {
            wordsHistory.RemoveAt(0);
        }

        // 6 features: the vector lengths of the first 3 words plus the lengths
        // of their pair/triple phrase sums; the target is the vector length of
        // the 4th word.
        double[] input = new double[6];
        input[0] = vocab.Vocabulary.GetRepresentationOrNullFor(wordsHistory[0]).MetricLength;
        input[1] = vocab.Vocabulary.GetRepresentationOrNullFor(wordsHistory[1]).MetricLength;
        input[2] = vocab.Vocabulary.GetRepresentationOrNullFor(wordsHistory[2]).MetricLength;
        input[3] = vocab.Vocabulary.GetSummRepresentationOrNullForPhrase(wordsHistory.Take(2).ToArray())?.MetricLength ?? 0d;
        input[4] = vocab.Vocabulary.GetSummRepresentationOrNullForPhrase(wordsHistory.Skip(1).Take(2).ToArray())?.MetricLength ?? 0d;
        input[5] = vocab.Vocabulary.GetSummRepresentationOrNullForPhrase(wordsHistory.Take(3).ToArray())?.MetricLength ?? 0d;
        double[] correct = { vocab.Vocabulary.GetRepresentationFor(wordsHistory[3]).MetricLength };

        trainSets.Add(new Tuple<double[], double[]>(input, correct));
    }

    if (trainSets.Count == 0)
    {
        log("No train sets");
        return;
    }
    log($"Train sets count: {trainSets.Count}");

    log("Train starts");
    var trainer = new NeuralNetTrainer()
        .SetDataSets(trainSets.ToArray())
        .SetNet(net);
    trainer.EpochsCount = 150;
    trainer.LearnRate = 0.001;
    trainer.SimpleTrain();
    log("Train end");

    foreach (var set in trainSets)
    {
        log($"Input: {set.Item1[0]} {set.Item1[1]} {set.Item1[2]}\t Result: {net.Activate(set.Item1)}");
    }
}
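// Log is only ever invoked as log(string) in these methods, so a plain string delegate
// (an assumption; the original declaration is not shown) is enough to drive them:
public delegate void Log(string message);

// Hypothetical usage from a console harness (the enclosing class name is not shown
// in the source, so TestWordsFixture is a placeholder):
//   var tests = new TestWordsFixture();
//   tests.TestWords(msg => Console.WriteLine(msg));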
public void Test(Log log)
{
    var voc = new LRVocab().Create("D:/vectors/google_vokab.bin", (data) => log(data));

    foreach (var p in Phrases)
    {
        var wb = WordBag.CreateToWords(p, 3);
        var pb = WordBag.CreateToPhrases(p, 1);
        log($"Process phrase '{p}'");

        var wordsResults = new List<Representation>();
        var phrasesResults = new List<Representation>();

        log("ToWords representation");
        foreach (var data in wb.Read())
        {
            var s1 = voc.Vocabulary.GetSummRepresentationOrNullForPhrase(data[0]);
            var s2 = voc.Vocabulary.GetSummRepresentationOrNullForPhrase(data[1]);
            var s3 = voc.Vocabulary.GetSummRepresentationOrNullForPhrase(data[2]);

            // Fold the non-null representations into a single sum.
            Representation[] s = { s1, s2, s3 };
            Representation result = null;
            foreach (var ss in s)
            {
                if (ss == null)
                {
                    continue;
                }
                if (result == null)
                {
                    result = ss;
                }
                else
                {
                    result.Add(ss);
                }
            }
            if (result == null)
            {
                continue;
            }

            log($"{data[0]} + {data[1]} + {data[2]} => {result.WordOrNull} ({result.MetricLength})");
            if (!string.IsNullOrWhiteSpace(result.WordOrNull))
            {
                wordsResults.Add(result);
            }
        }
        log("words results:");
        log(string.Join(" ", wordsResults.Select(x => x.WordOrNull)));
        log("");

        log("phrases representation");
        foreach (var data in pb.Read())
        {
            var result = voc.Vocabulary.GetSummRepresentationOrNullForPhrase(data);
            if (result == null)
            {
                continue;
            }
            log($"{data[0]} => {result.WordOrNull} ({result.MetricLength})");
            if (!string.IsNullOrWhiteSpace(result.WordOrNull))
            {
                phrasesResults.Add(result);
            }
        }
        log("phrase results:");
        log(string.Join(" ", phrasesResults.Select(x => x.WordOrNull)));
        log("");
        log("------");
    }
}
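// Note on the fold above: when result is first assigned it aliases s1/s2/s3, so if
// Representation.Add mutates in place, the sum is written back into the object the
// vocabulary handed out. A non-mutating alternative folds the raw vectors instead.
// This is a sketch, assuming Representation exposes its float[] as NumericVector
// (as in the Word2vec.Tools library these method names come from):
private static double[] SumVectors(params Representation[] parts)
{
    double[] sum = null;
    foreach (var part in parts)
    {
        if (part == null) continue;
        if (sum == null)
        {
            // copy so the vocabulary's vector is never touched
            sum = part.NumericVector.Select(x => (double)x).ToArray();
        }
        else
        {
            for (int i = 0; i < sum.Length; i++)
            {
                sum[i] += part.NumericVector[i];
            }
        }
    }
    return sum; // null when every part was null
}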
private void HMM_forn_Load(object sender, EventArgs e)
{
    wordbag = new WordBag();
}
protected async Task<TrainSet> ScheduleProcessTwitter(TrainSet data)
{
    var collection = await _twitterCollectionsStore.Get(data.SourceId);
    if (collection == null)
    {
        data.SetFailed("Can not find source data");
        return data;
    }

    var sources = await _twitterSourcesStore.GetBy(x => x.CollectionId == collection.Id);
    if (sources == null || !sources.Any())
    {
        data.SetFailed("Can not find any twitter sources");
        return data;
    }

    var user = await _userStore.FindByIdAsync(collection.UserId);
    if (user == null)
    {
        data.SetFailed("Can not find user data");
        return data;
    }

    var userTwitter = await _userSocialsStore.GetTwitter(user.Id);
    if (userTwitter == null)
    {
        data.SetFailed("No twitter access token");
        return data;
    }

    try
    {
        OAuthTwitter(userTwitter);
    }
    catch
    {
        data.SetFailed("Error with twitter connections");
        return data;
    }

    // Clamp the requested range, then swap the bounds if they arrived reversed.
    // (The original swap assigned max = min before reading max, collapsing both to min.)
    int min = Math.Max(1, data.MinCount);
    int max = Math.Max(100, data.MaxCount);
    max = Math.Min(10000, max);
    if (min > max)
    {
        var t = min;
        min = max;
        max = t;
    }

    // Download twitter data: up to `max` tweets, spread evenly over the sources.
    int perSource = (int)Math.Ceiling((double)max / sources.Count);
    var entity = new TrainSetModel();
    var rawData = new StringBuilder();
    int total = 0;
    var regex = new Regex("http[s]?://[A-Za-z0-9._-]*"); // strip links from tweet text

    foreach (var screen in sources)
    {
        long? lastId = null;
        int count = 0;
        var twitterUser = await UserAsync.GetUserFromId(screen.TwitterId);
        while (perSource > count)
        {
            var @params = new UserTimelineParameters
            {
                MaximumNumberOfTweetsToRetrieve = 50,
            };
            if (lastId.HasValue)
            {
                // Page backwards through the timeline. The original set SinceId and
                // never updated lastId, so every iteration refetched the same page.
                // (Assumes Tweetinvi's timeline parameters expose MaxId.)
                @params.MaxId = lastId.Value;
            }
            var tweets = await TimelineAsync.GetUserTimeline(twitterUser, @params);
            if (tweets == null || !tweets.Any())
            {
                break;
            }
            lastId = tweets.Min(t => t.Id) - 1;
            count += tweets.Count();
            foreach (var t in tweets)
            {
                // trailing space so consecutive tweets do not glue words together
                rawData.Append(regex.Replace(t.FullText, string.Empty)).Append(' ');
            }
        }
        total += count;
    }

    if (total < min)
    {
        data.SetFailed($"Not enough data available. Available: {total}. Minimum: {min}");
        return data;
    }

    // Slice the text into (input words, next word) windows and look every word up
    // in the word2vec vocabulary; windows containing unknown words are dropped.
    WordBag wb = WordBag.CreateToWords(rawData.ToString(), data.InputWordsCount + 1);
    _vReader.UploadBinary();
    var stringList = new List<Tuple<string[], string[]>>();
    var doubleList = new List<Tuple<double[], double[]>>();
    foreach (var s in wb.Read())
    {
        var vectorList = new List<double[]>();
        var wordList = new List<string>();
        foreach (var ss in s)
        {
            var word = _vReader.Vocab.GetRepresentationOrNullFor(ss);
            if (word == null)
            {
                break;
            }
            // Use the embedding vector (Representation.NumericVector in Word2vec.Tools);
            // the original iterated ss itself, casting the word's characters to doubles.
            vectorList.Add(word.NumericVector.Select(x => (double)x).ToArray());
            wordList.Add(ss);
        }
        if (vectorList.Count < s.Length)
        {
            continue;
        }

        // Concatenate the input-word vectors; the last word's vector is the target.
        var tmpVector = new List<double>();
        foreach (var i in vectorList.Take(data.InputWordsCount))
        {
            tmpVector.AddRange(i);
        }
        doubleList.Add(new Tuple<double[], double[]>(tmpVector.ToArray(), vectorList.Last().ToArray()));
        stringList.Add(new Tuple<string[], string[]>(
            wordList.Take(wordList.Count - 1).ToArray(),
            new string[1] { wordList.Last() }));
    }

    entity.Data = doubleList.ToArray();
    entity.StringSource = stringList.ToArray();

    string dataString = JsonConvert.SerializeObject(entity);
    await _storageBlobClient.SetContainer(CONTAINER_NAME, true);
    var storageKey = await _storageBlobClient.WriteText(dataString);
    if (string.IsNullOrWhiteSpace(storageKey))
    {
        data.SetFailed("Can not upload train set to storage");
        return data;
    }

    data.StorageKey = storageKey;
    data.SetReady();
    data.ExamplesCount = entity.Data.Count();
    return data;
}
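// Sketch (an assumption; the original class is not shown) of the payload that
// ScheduleProcessTwitter serializes to blob storage. It only needs the two array
// properties assigned above, so a minimal shape would be:
public class TrainSetModelSketch
{
    // (concatenated input vectors, target vector) pairs for the trainer
    public Tuple<double[], double[]>[] Data { get; set; }

    // the matching (input words, target word) pairs, kept for inspection
    public Tuple<string[], string[]>[] StringSource { get; set; }
}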