/// <summary>
/// Marks the supplied train set as failed with a "not implemented" message and returns it.
/// </summary>
/// <param name="data">Train set to flag as failed.</param>
/// <returns>The same <paramref name="data"/> instance after <c>SetFailed</c> was called on it.</returns>
protected async Task<TrainSet> ScheduleProcessNotImplemented(TrainSet data)
{
    // The method contains no awaits, so it completes synchronously; it stays
    // declared async so that it keeps the same Task-returning shape.
    data.SetFailed("Not implimented yet");

    return data;
}
/// <summary>
/// Builds a training set from the twitter sources of the collection referenced by
/// <paramref name="data"/>, uploads it to blob storage and records the result on
/// <paramref name="data"/>.
/// </summary>
/// <remarks>
/// The method resolves the collection, its twitter sources, the owning user and the user's
/// twitter credentials, downloads tweets for every source, strips URLs from the tweet text,
/// converts the text into word groups via <c>WordBag</c>, maps the words to vectors through
/// <c>_vReader.Vocab</c>, serializes the result to JSON and writes it to the blob container.
/// Every failure is reported through <c>data.SetFailed</c>; the method then returns early.
/// </remarks>
/// <param name="data">
/// Train set descriptor. <c>SourceId</c>, <c>MinCount</c>, <c>MaxCount</c> and
/// <c>InputWordsCount</c> are read; <c>StorageKey</c> and <c>ExamplesCount</c> are written on success.
/// </param>
/// <returns>The same <paramref name="data"/> instance, after <c>SetReady</c> or <c>SetFailed</c> was called.</returns>
protected async Task<TrainSet> ScheduleProcessTwitter(TrainSet data)
{
    var collection = await _twitterCollectionsStore.Get(data.SourceId);
    if (collection == null)
    {
        data.SetFailed("Can not find source data");
        return data;
    }

    var sources = await _twitterSourcesStore.GetBy(x => x.CollectionId == collection.Id);
    if (sources == null || !sources.Any())
    {
        data.SetFailed("Can not find any twitter sources");
        return data;
    }

    var user = await _userStore.FindByIdAsync(collection.UserId);
    if (user == null)
    {
        data.SetFailed("Can not find user data");
        return data;
    }

    var userTwitter = await _userSocialsStore.GetTwitter(user.Id);
    if (userTwitter == null)
    {
        data.SetFailed("No twitter access token");
        return data;
    }

    try
    {
        OAuthTwitter(userTwitter);
    }
    catch
    {
        // Any authentication problem is reported as a failed train set instead of propagating.
        data.SetFailed("Error with twitter connections");
        return data;
    }

    // upload twitter data
    // Normalize the requested bounds: min is at least 1, max is clamped to [100, 10000].
    int min = Math.Max(1, data.MinCount);
    int max = Math.Min(10000, Math.Max(100, data.MaxCount));
    if (min > max)
    {
        // Swap the bounds so that min <= max. The previous code saved min in the
        // temporary, which left min untouched and overwrote max with it.
        var swap = max;
        max = min;
        min = swap;
    }

    // Spread the maximum tweet budget evenly over the sources.
    int perSource = (int)Math.Ceiling((double)max / sources.Count);

    var entity = new TrainSetModel { };
    var rawData = new StringBuilder();
    int total = 0;
    var regex = new Regex("http[s]?://[A-Za-z0-9._-]*");

    foreach (var screen in sources)
    {
        // Inclusive upper tweet id for the next page; null requests the newest tweets.
        long? maxId = null;
        int count = 0;

        var twitterUser = await UserAsync.GetUserFromId(screen.TwitterId);
        if (twitterUser == null)
        {
            // The account could not be resolved; skip it and let the
            // "not enough data" check below decide whether the run fails.
            continue;
        }

        while (perSource > count)
        {
            var @params = new UserTimelineParameters
            {
                MaximumNumberOfTweetsToRetrieve = 50,
            };
            if (maxId.HasValue)
            {
                @params.MaxId = maxId.Value;
            }

            var page = await TimelineAsync.GetUserTimeline(twitterUser, @params);

            // Materialize once: the result is inspected several times below.
            var tweets = page == null ? null : page.ToList();
            if (tweets == null || tweets.Count == 0)
            {
                break;
            }

            count += tweets.Count;
            foreach (var tweet in tweets)
            {
                // Strip URLs and separate tweets with a space so that the last word of
                // one tweet does not fuse with the first word of the next.
                rawData.Append(regex.Replace(tweet.FullText, string.Empty));
                rawData.Append(' ');
            }

            // Page backwards through the timeline. The upper bound is inclusive, so step
            // below the oldest tweet just received; otherwise the same page would be
            // fetched again and counted as new data.
            maxId = tweets.Min(x => x.Id) - 1;
        }

        total += count;
    }

    if (total < min)
    {
        data.SetFailed($"Not enough data avaliable. Avaliable : {total}. Minimum: {min}");
        return data;
    }

    // Each group read from the bag is used as InputWordsCount input words plus one target word.
    WordBag wb = WordBag.CreateToWords(rawData.ToString(), data.InputWordsCount + 1);
    _vReader.UploadBinary();

    var stringList = new List<Tuple<string[], string[]>>();
    var doubleList = new List<Tuple<double[], double[]>>();

    foreach (var group in wb.Read())
    {
        var vectorList = new List<double[]>();
        var wordList = new List<string>();

        foreach (var token in group)
        {
            var representation = _vReader.Vocab.GetRepresentationOrNullFor(token);
            if (representation == null)
            {
                break;
            }

            // Use the vocabulary's vector for the word. The previous code enumerated the
            // characters of the word itself, producing char codes of varying length.
            // NOTE(review): assumes the representation exposes its embedding as
            // NumericVector (as in Word2vec.Tools) - confirm against the _vReader type.
            vectorList.Add(representation.NumericVector.Select(x => (double)x).ToArray());
            wordList.Add(token);
        }

        // Skip empty groups and groups containing a word missing from the vocabulary.
        if (vectorList.Count == 0 || vectorList.Count < group.Length)
        {
            continue;
        }

        // Input: the first InputWordsCount vectors concatenated. Output: the last vector.
        var input = new List<double>();
        foreach (var vector in vectorList.Take(data.InputWordsCount))
        {
            input.AddRange(vector);
        }

        doubleList.Add(new Tuple<double[], double[]>(input.ToArray(), vectorList.Last()));
        stringList.Add(new Tuple<string[], string[]>(
            wordList.Take(wordList.Count - 1).ToArray(),
            new string[1] { wordList.Last() }));
    }

    entity.Data = doubleList.ToArray();
    entity.StringSource = stringList.ToArray();

    string dataString = JsonConvert.SerializeObject(entity);
    await _storageBlobClient.SetContainer(CONTAINER_NAME, true);
    var storageKey = await _storageBlobClient.WriteText(dataString);
    if (string.IsNullOrWhiteSpace(storageKey))
    {
        data.SetFailed("Can not upload train set to storage");
        return data;
    }

    data.StorageKey = storageKey;
    data.SetReady();
    data.ExamplesCount = doubleList.Count;
    return data;
}