示例#1
0
        /// <summary>
        /// Click handler: loads one <c>WordBag</c> per user-selected file and then calls
        /// <c>get_TD_IDF</c> on every bag, passing it all the other bags.
        /// </summary>
        /// <remarks>
        /// The number of files is read from <c>textBox1</c>. Each file is chosen through
        /// <c>openFileDialog1</c> and decoded as "ks_c_5601-1987". If the count is not a
        /// non-negative integer, or the user cancels one of the dialogs, the handler stops
        /// and leaves <c>number</c> and <c>wb</c> untouched.
        /// </remarks>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        private void button4_Click(object sender, EventArgs e)
        {
            int requested;
            if (!int.TryParse(textBox1.Text, out requested) || requested < 0)
            {
                // Invalid input used to surface as an unhandled exception from the handler
                MessageBox.Show("Please enter a non-negative whole number of files.");
                return;
            }

            // Resolved once; the encoding does not change between files
            Encoding encode = System.Text.Encoding.GetEncoding("ks_c_5601-1987");

            // Filled locally and published only when every file was loaded, so a
            // cancelled dialog can never leave null slots in the shared array
            WordBag[] loaded = new WordBag[requested];

            for (int a = 0; a < requested; a++)
            {
                if (openFileDialog1.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                StringBuilder wholeWords = new StringBuilder();

                // using guarantees the file handle is released even if reading throws
                using (StreamReader reader = new StreamReader(openFileDialog1.FileName, encode))
                {
                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        // NOTE(review): lines are joined without a separator, so the last word
                        // of one line and the first word of the next end up as a single token
                        // after Split(' ') - confirm this is intended
                        wholeWords.Append(line);
                    }
                }

                string[] bags = wholeWords.ToString().Split(' ');

                loaded[a] = new WordBag();
                loaded[a].getBag(bags, 15);
                loaded[a].SortingBag();
            }

            number = requested;
            wb     = loaded;

            for (int a = 0; a < number; a++)
            {
                // Every bag except the current one
                WordBag[] others = new WordBag[number - 1];
                int       cnt    = 0;

                for (int b = 0; b < number; b++)
                {
                    if (a != b)
                    {
                        others[cnt] = wb[b];
                        cnt++;
                    }
                }

                wb[a].get_TD_IDF(others);
            }
        }
示例#2
0
        /// <summary>
        /// Builds training samples from <c>Configuration.RawDataList</c>, trains an
        /// <c>SNeuralNet</c> on them and logs the net's output for every sample.
        /// </summary>
        /// <remarks>
        /// A sample is produced for each run of four consecutive vocabulary words. The six
        /// inputs are the <c>MetricLength</c> of the first three words and of three summed
        /// sub-phrases; the expected output is the <c>MetricLength</c> of the fourth word.
        /// </remarks>
        /// <param name="log">Callback that receives progress and result messages.</param>
        public void TestWords(Log log)
        {
            var net     = new SNeuralNet(6, 2, 300, 1);
            var wordBag = WordBag.CreateToWords(string.Join(". ", Configuration.RawDataList), 1);
            var vocab   = new LRVocab().Create(Configuration.VocabularyPath, (string message) => log(message));

            var samples = new List<Tuple<double[], double[]>>();
            var window  = new List<string>();

            log("Prepare tests list");

            foreach (var entry in wordBag.Read())
            {
                var current = entry[0];

                // Words missing from the vocabulary never enter the window
                if (!vocab.Vocabulary.ContainsWord(current))
                {
                    continue;
                }

                window.Add(current);

                if (window.Count < 4)
                {
                    continue;
                }

                // Keep only the four most recent words
                if (window.Count > 4)
                {
                    window.RemoveAt(0);
                }

                double first  = vocab.Vocabulary.GetRepresentationOrNullFor(window[0]).MetricLength;
                double second = vocab.Vocabulary.GetRepresentationOrNullFor(window[1]).MetricLength;
                double third  = vocab.Vocabulary.GetRepresentationOrNullFor(window[2]).MetricLength;

                double headPair = vocab.Vocabulary.GetSummRepresentationOrNullForPhrase(window.Take(2).ToArray())?.MetricLength ?? 0d;
                double tailPair = vocab.Vocabulary.GetSummRepresentationOrNullForPhrase(window.Skip(1).Take(2).ToArray())?.MetricLength ?? 0d;
                double triple   = vocab.Vocabulary.GetSummRepresentationOrNullForPhrase(window.Take(3).ToArray())?.MetricLength ?? 0d;

                double expected = vocab.Vocabulary.GetRepresentationFor(window[3]).MetricLength;

                var inputs = new double[] { first, second, third, headPair, tailPair, triple };
                var target = new double[] { expected };

                samples.Add(Tuple.Create(inputs, target));
            }

            if (samples.Count < 1)
            {
                log("No train sets");
                return;
            }

            log($"Train sets count: {samples.Count}");
            log("Train starts");

            var trainer = new NeuralNetTrainer()
                          .SetDataSets(samples.ToArray())
                          .SetNet(net);

            trainer.EpochsCount = 150;
            trainer.LearnRate   = 0.001;

            trainer.SimpleTrain();
            log("Train end");

            for (int i = 0; i < samples.Count; i++)
            {
                double[] sampleInput = samples[i].Item1;
                log($"Input: {sampleInput[0]} {sampleInput[1]} {sampleInput[2]}\t Result: {net.Activate(sampleInput)}");
            }
        }
示例#3
0
        /// <summary>
        /// For every entry in <c>Phrases</c>, logs summed vocabulary representations obtained
        /// two ways: from a bag created with <c>WordBag.CreateToWords(p, 3)</c> and from a bag
        /// created with <c>WordBag.CreateToPhrases(p, 1)</c>.
        /// </summary>
        /// <param name="log">Callback that receives every output line.</param>
        public void Test(Log log)
        {
            var voc = new LRVocab().Create("D:/vectors/google_vokab.bin", (message) => log(message));

            foreach (var phrase in Phrases)
            {
                var wordsBag   = WordBag.CreateToWords(phrase, 3);
                var phrasesBag = WordBag.CreateToPhrases(phrase, 1);

                log($"Process phrase '{phrase}'");

                var wordsResults   = new List<Representation>();
                var phrasesResults = new List<Representation>();

                log("ToWords representation");

                foreach (var triple in wordsBag.Read())
                {
                    // All three lookups happen before any of them is combined
                    Representation[] parts =
                    {
                        voc.Vocabulary.GetSummRepresentationOrNullForPhrase(triple[0]),
                        voc.Vocabulary.GetSummRepresentationOrNullForPhrase(triple[1]),
                        voc.Vocabulary.GetSummRepresentationOrNullForPhrase(triple[2]),
                    };

                    // The first non-null part becomes the accumulator; later ones are added to it
                    Representation sum = null;
                    for (int i = 0; i < parts.Length; i++)
                    {
                        var part = parts[i];

                        if (part == null)
                        {
                            continue;
                        }

                        if (sum == null)
                        {
                            sum = part;
                            continue;
                        }

                        sum.Add(part);
                    }

                    if (sum == null)
                    {
                        continue;
                    }

                    log($"{triple[0]} + {triple[1]} + {triple[2]} => {sum.WordOrNull} ({sum.MetricLength})");

                    bool hasWord = !string.IsNullOrWhiteSpace(sum.WordOrNull);
                    if (hasWord)
                    {
                        wordsResults.Add(sum);
                    }
                }

                log("words results:");
                log(string.Join(" ", wordsResults.Select(r => r.WordOrNull)));
                log("");

                log("phrases represuntation");

                foreach (var chunk in phrasesBag.Read())
                {
                    var summed = voc.Vocabulary.GetSummRepresentationOrNullForPhrase(chunk);

                    if (summed != null)
                    {
                        log($"{chunk[0]} => {summed.WordOrNull} ({summed.MetricLength})");

                        bool hasWord = !string.IsNullOrWhiteSpace(summed.WordOrNull);
                        if (hasWord)
                        {
                            phrasesResults.Add(summed);
                        }
                    }
                }

                log("phrase results:");
                log(string.Join(" ", phrasesResults.Select(r => r.WordOrNull)));
                log("");
                log("------");
            }
        }
示例#4
0
 /// <summary>
 /// Load handler: assigns a freshly constructed <c>WordBag</c> to <c>wordbag</c>.
 /// </summary>
 /// <param name="sender">Event source (unused).</param>
 /// <param name="e">Event arguments (unused).</param>
 private void HMM_forn_Load(object sender, EventArgs e) => wordbag = new WordBag();
        /// <summary>
        /// Builds a train set from the Twitter timelines of the sources attached to the
        /// collection referenced by <paramref name="data"/>, and uploads it as JSON to blob storage.
        /// </summary>
        /// <remarks>
        /// Steps: resolve the collection, its sources, the owning user and that user's Twitter
        /// credentials; download tweets (links stripped) until each source has supplied its
        /// share of the requested maximum; cut the text into word windows of
        /// <c>InputWordsCount + 1</c> words; keep only windows whose words are all known to the
        /// vocabulary; serialize and upload the result.
        /// </remarks>
        /// <param name="data">Train set to process. It is updated in place via
        /// <c>SetFailed</c> / <c>SetReady</c>, <c>StorageKey</c> and <c>ExamplesCount</c>.</param>
        /// <returns>The same <paramref name="data"/> instance, on every path.</returns>
        protected async Task<TrainSet> ScheduleProcessTwitter(TrainSet data)
        {
            var collection = await _twitterCollectionsStore.Get(data.SourceId);

            if (collection == null)
            {
                data.SetFailed("Can not find source data");
                return data;
            }

            var sources = await _twitterSourcesStore.GetBy(x => x.CollectionId == collection.Id);

            if (sources == null || !sources.Any())
            {
                data.SetFailed("Can not find any twitter sources");
                return data;
            }

            var user = await _userStore.FindByIdAsync(collection.UserId);

            if (user == null)
            {
                data.SetFailed("Can not find user data");
                return data;
            }

            var userTwitter = await _userSocialsStore.GetTwitter(user.Id);

            if (userTwitter == null)
            {
                data.SetFailed("No twitter access token");
                return data;
            }

            try
            {
                OAuthTwitter(userTwitter);
            }
            catch
            {
                // Any authentication failure is reported to the caller through the train set state
                data.SetFailed("Error with twitter connections");
                return data;
            }

            // upload twitter data

            int min = Math.Max(1, data.MinCount);
            int max = Math.Min(10000, Math.Max(100, data.MaxCount));

            if (min > max)
            {
                // Real swap. The previous code assigned max = min and then min = (old) min,
                // which left min unchanged and collapsed the range to a single value.
                var swap = min;
                min = max;
                max = swap;
            }

            int perSource = (int)Math.Ceiling((double)max / sources.Count);

            var rawData = new StringBuilder();
            int total   = 0;

            var regex = new Regex("http[s]?://[A-Za-z0-9._-]*");

            foreach (var screen in sources)
            {
                long? oldestId    = null;
                int   count       = 0;
                var   twitterUser = await UserAsync.GetUserFromId(screen.TwitterId);

                if (twitterUser == null)
                {
                    // Account could not be resolved; it contributes no tweets
                    continue;
                }

                while (perSource > count)
                {
                    var @params = new UserTimelineParameters
                    {
                        MaximumNumberOfTweetsToRetrieve = 50,
                    };

                    if (oldestId.HasValue)
                    {
                        // Page backwards through the timeline. MaxId is inclusive, so step past
                        // the oldest tweet already collected. Previously the cursor was never
                        // updated and every iteration fetched the same newest page again.
                        @params.MaxId = oldestId.Value - 1;
                    }

                    var tweets = await TimelineAsync.GetUserTimeline(twitterUser, @params);

                    // Materialize once; the result is counted and iterated below
                    var page = tweets == null ? null : tweets.ToList();

                    if (page == null || page.Count == 0)
                    {
                        break;
                    }

                    count += page.Count;

                    foreach (var tweet in page)
                    {
                        // Separate tweets so the last word of one does not fuse with the
                        // first word of the next
                        rawData.Append(regex.Replace(tweet.FullText, string.Empty)).Append(". ");

                        if (!oldestId.HasValue || tweet.Id < oldestId.Value)
                        {
                            oldestId = tweet.Id;
                        }
                    }
                }

                total += count;
            }

            if (total < min)
            {
                data.SetFailed($"Not enough data avaliable. Avaliable : {total}. Minimum: {min}");
                return data;
            }

            WordBag wb = WordBag.CreateToWords(rawData.ToString(), data.InputWordsCount + 1);

            _vReader.UploadBinary();

            var stringList = new List<Tuple<string[], string[]>>();
            var doubleList = new List<Tuple<double[], double[]>>();

            foreach (var s in wb.Read())
            {
                var vectorList = new List<double[]>();
                var wordList   = new List<string>();

                foreach (var ss in s)
                {
                    var word = _vReader.Vocab.GetRepresentationOrNullFor(ss);
                    if (word == null)
                    {
                        // One unknown word invalidates the whole window
                        break;
                    }

                    // NOTE(review): this stores the character codes of the word, not data taken
                    // from the looked-up representation `word`, which is used only for the null
                    // check above. Presumably the representation's numeric vector was intended -
                    // confirm against the vocabulary API before changing.
                    vectorList.Add(ss.Select(x => (double)x).ToArray());
                    wordList.Add(ss);
                }

                if (vectorList.Count < s.Length)
                {
                    continue;
                }

                // Inputs: the first InputWordsCount vectors concatenated; output: the last vector
                var tmpVector = new List<double>();
                foreach (var i in vectorList.Take(data.InputWordsCount))
                {
                    tmpVector.AddRange(i);
                }

                doubleList.Add(new Tuple<double[], double[]>(tmpVector.ToArray(), vectorList.Last().ToArray()));
                stringList.Add(new Tuple<string[], string[]>(
                                   wordList.Take(wordList.Count - 1).ToArray(),
                                   new string[1] { wordList.Last() }));
            }

            var entity = new TrainSetModel {
            };

            entity.Data         = doubleList.ToArray();
            entity.StringSource = stringList.ToArray();

            string dataString = JsonConvert.SerializeObject(entity);

            await _storageBlobClient.SetContainer(CONTAINER_NAME, true);

            var storageKey = await _storageBlobClient.WriteText(dataString);

            if (string.IsNullOrWhiteSpace(storageKey))
            {
                data.SetFailed("Can not upload train set to storage");
                return data;
            }

            data.StorageKey = storageKey;
            data.SetReady();

            data.ExamplesCount = entity.Data.Count();

            return data;
        }