Пример #1
0
 /// <summary>
 /// Marks the given train set as failed with a "not implemented" message and hands it back.
 /// </summary>
 /// <param name="data">Train set to mark as failed.</param>
 /// <returns>The same <paramref name="data"/> instance after <c>SetFailed</c> has been called on it.</returns>
 protected async Task <TrainSet> ScheduleProcessNotImplemented(TrainSet data)
 {
     // The method contains no awaits, so it runs synchronously; it is still declared
     // async, which means any exception is delivered through the returned task.
     // The message text (including its spelling) is passed to SetFailed at runtime and is kept verbatim.
     const string reason = "Not implimented yet";

     data.SetFailed(reason);

     return data;
 }
Пример #2
0
        /// <summary>
        /// Builds a train set from the twitter timelines attached to a collection and uploads it to blob storage.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// <list type="number">
        /// <item><description>Loads the collection (by <c>data.SourceId</c>), its twitter sources, the owning user and that user's twitter credentials; any missing piece fails the train set.</description></item>
        /// <item><description>Authenticates against twitter via <c>OAuthTwitter</c>.</description></item>
        /// <item><description>Downloads tweets per source in pages of 50, walking backwards through the timeline, and strips URLs from the text.</description></item>
        /// <item><description>Splits the text into word windows of <c>data.InputWordsCount + 1</c> words; windows containing a word unknown to the vocabulary are skipped.</description></item>
        /// <item><description>Serializes the resulting model to JSON, writes it to blob storage and records the storage key and example count on <paramref name="data"/>.</description></item>
        /// </list>
        /// </remarks>
        /// <param name="data">Train set request; it is mutated (failed or marked ready) and returned.</param>
        /// <returns>The same <paramref name="data"/> instance, either failed with a reason or marked ready.</returns>
        protected async Task <TrainSet> ScheduleProcessTwitter(TrainSet data)
        {
            var collection = await _twitterCollectionsStore.Get(data.SourceId);

            if (collection == null)
            {
                data.SetFailed("Can not find source data");
                return data;
            }

            var sources = await _twitterSourcesStore.GetBy(x => x.CollectionId == collection.Id);

            if (sources == null || !sources.Any())
            {
                data.SetFailed("Can not find any twitter sources");
                return data;
            }

            var user = await _userStore.FindByIdAsync(collection.UserId);

            if (user == null)
            {
                data.SetFailed("Can not find user data");
                return data;
            }

            var userTwitter = await _userSocialsStore.GetTwitter(user.Id);

            if (userTwitter == null)
            {
                data.SetFailed("No twitter access token");
                return data;
            }

            try
            {
                OAuthTwitter(userTwitter);
            }
            catch
            {
                // Any authentication problem is reported as a failed train set instead of propagating.
                data.SetFailed("Error with twitter connections");
                return data;
            }

            // Normalize the requested bounds: min is at least 1, max is kept within [100, 10000].
            int min = Math.Max(1, data.MinCount);
            int max = Math.Min(10000, Math.Max(100, data.MaxCount));

            if (min > max)
            {
                // Real swap (the previous code assigned max = min and left min untouched).
                int swap = min;
                min = max;
                max = swap;

                // The swapped-in value was never capped, so re-apply the upper limit.
                // min is at most 10000 at this point, so min <= max still holds.
                max = Math.Min(10000, max);
            }

            int perSource = (int)Math.Ceiling((double)max / sources.Count);
            var entity    = new TrainSetModel();

            var rawData = new StringBuilder();
            int total   = 0;

            // Removes http/https links from the tweet text.
            var regex = new Regex("http[s]?://[A-Za-z0-9._-]*");

            foreach (var screen in sources)
            {
                long? oldestId    = null;
                int   count       = 0;
                var   twitterUser = await UserAsync.GetUserFromId(screen.TwitterId);

                if (twitterUser == null)
                {
                    // The account could not be resolved; skip this source rather than
                    // passing a null user to the timeline call.
                    continue;
                }

                while (perSource > count)
                {
                    var @params = new UserTimelineParameters
                    {
                        MaximumNumberOfTweetsToRetrieve = 50,
                    };

                    if (oldestId.HasValue)
                    {
                        // Page backwards: only ask for tweets strictly older than the oldest one
                        // already processed. Previously the cursor was never set, so every
                        // iteration fetched (and appended) the same newest tweets again.
                        @params.MaxId = oldestId.Value - 1;
                    }

                    var tweets = await TimelineAsync.GetUserTimeline(twitterUser, @params);

                    // Materialize once so the page is not enumerated several times.
                    var page = tweets?.ToList();

                    if (page == null || page.Count == 0)
                    {
                        break;
                    }

                    count += page.Count;
                    foreach (var tweet in page)
                    {
                        // NOTE(review): tweets are concatenated without a separator, so the last word of
                        // one tweet may merge with the first word of the next — confirm against WordBag.
                        rawData.Append(regex.Replace(tweet.FullText, string.Empty));

                        if (!oldestId.HasValue || tweet.Id < oldestId.Value)
                        {
                            oldestId = tweet.Id;
                        }
                    }
                }

                total += count;
            }

            if (total < min)
            {
                data.SetFailed($"Not enough data avaliable. Avaliable : {total}. Minimum: {min}");
                return data;
            }

            // Each window holds InputWordsCount input words plus one trailing target word.
            WordBag wb = WordBag.CreateToWords(rawData.ToString(), data.InputWordsCount + 1);

            _vReader.UploadBinary();

            var stringList = new List <Tuple <string[], string[]> >();
            var doubleList = new List <Tuple <double[], double[]> >();

            foreach (var window in wb.Read())
            {
                var vectorList = new List <double[]>();
                var wordList   = new List <string>();

                foreach (var token in window)
                {
                    var word = _vReader.Vocab.GetRepresentationOrNullFor(token);
                    if (word == null)
                    {
                        // Unknown word: the whole window is dropped by the length check below.
                        break;
                    }

                    // NOTE(review): the looked-up representation is only used as an existence check;
                    // the stored vector is built from the token's character codes. This looks
                    // unintended (vector length varies with word length) — confirm whether the
                    // representation's numeric vector should be used instead.
                    vectorList.Add(token.Select(x => (double)x).ToArray());
                    wordList.Add(token);
                }

                if (vectorList.Count < window.Length)
                {
                    continue;
                }

                // Input: the first InputWordsCount vectors flattened; output: the last word's vector.
                var input = new List <double>();
                foreach (var vector in vectorList.Take(data.InputWordsCount))
                {
                    input.AddRange(vector);
                }

                doubleList.Add(new Tuple <double[], double[]>(input.ToArray(), vectorList.Last().ToArray()));
                stringList.Add(new Tuple <string[], string[]>(
                                   wordList.Take(wordList.Count - 1).ToArray(),
                                   new string[1] { wordList.Last() }));
            }

            entity.Data         = doubleList.ToArray();
            entity.StringSource = stringList.ToArray();

            string dataString = JsonConvert.SerializeObject(entity);

            await _storageBlobClient.SetContainer(CONTAINER_NAME, true);

            var storageKey = await _storageBlobClient.WriteText(dataString);

            if (string.IsNullOrWhiteSpace(storageKey))
            {
                data.SetFailed("Can not upload train set to storage");
                return data;
            }

            data.StorageKey = storageKey;
            data.SetReady();

            // Same value as the number of entries assigned to entity.Data above.
            data.ExamplesCount = doubleList.Count;

            return data;
        }