public void Createhashtag()
{
    // A token starting with '#' is classified as a Hashtag entity.
    instance = WordOccurrence.Create(
        helper.Handler.Object,
        helper.RawTextExractor.Object,
        helper.InquirerManager.Object,
        "#word",
        null,
        POSTags.Instance.NN);
    Assert.AreEqual(NamedEntities.Hashtag, instance.Entity);

    // Re-assigning the entity is ignored for hashtags — it stays Hashtag.
    instance.Entity = NamedEntities.Date;
    Assert.AreEqual(NamedEntities.Hashtag, instance.Entity);
}
/// <summary>
/// Trains the profile's word chain from the given text content: tokenizes the
/// text, filters out mentions/hashtags/links/quoted fragments, resolves each
/// word against the profile vocabulary or the word repository, and links
/// consecutive word occurrences.
/// </summary>
/// <param name="profile">The profile to train; must not be null.</param>
/// <param name="content">The text content to learn from; its Text must not be null.</param>
/// <returns>The same <paramref name="profile"/> instance, updated.</returns>
/// <exception cref="ArgumentNullException">When profile or content text is null.</exception>
public TwitterProfile Train(TwitterProfile profile, TextContent content)
{
    // Validate each argument separately so the exception names the offender
    // (the original threw a bare ArgumentNullException with no parameter name).
    if (profile == null)
    {
        throw new ArgumentNullException(nameof(profile));
    }

    if (content?.Text == null)
    {
        throw new ArgumentNullException(nameof(content));
    }

    var regex = new Regex(@"(\.|,| |!|\?)");
    var excluded = new Regex(@"(^@|^#|^;|^:|^http|^HTTP|\/|\\|…$|&\S+|^\""$|^\"".*[^\""]$|^[^\""].*\""$|^RT$)");

    //TODO move remove @ and # to tweet generator instead of trainer
    var words = regex.Split(content.Text)
        .Where(word => !string.IsNullOrWhiteSpace(word))
        .Where(word => !excluded.IsMatch(word))
        .Select(word => new Word(word))
        .ToList();

    WordOccurrence lastWordOccurrence = null;
    foreach (var word in words)
    {
        // Single vocabulary scan (the original enumerated it twice via Any + SingleOrDefault).
        var temporaryWord = profile.Vocabulary.SingleOrDefault(w => w.Equals(word));
        if (temporaryWord == null)
        {
            // Cache the repository lookup so Get is not executed twice per word.
            temporaryWord = _wordRepository.Get(word) ?? _wordRepository.Add(word);
        }

        if (temporaryWord == null)
        {
            continue;
        }

        var currentWordOccurrence = profile.Words.SingleOrDefault(wo => wo.Word == temporaryWord);
        if (currentWordOccurrence == null)
        {
            currentWordOccurrence = profile.AddWord(temporaryWord);
        }
        else
        {
            currentWordOccurrence.Occurrence++;
        }

        // Link the previous occurrence to the current word to build the chain.
        lastWordOccurrence?.AddOccurrence(currentWordOccurrence.Word);
        lastWordOccurrence = currentWordOccurrence;
    }

    return profile;
}
public void CreateArguments()
{
    // Hoist the repeated mock accessors so each case reads as a one-liner.
    var handler = helper.Handler.Object;
    var rawText = helper.RawTextExractor.Object;
    var inquirer = helper.InquirerManager.Object;
    var noun = POSTags.Instance.NN;

    // Null dependencies are rejected with ArgumentNullException.
    Assert.Throws<ArgumentNullException>(() => WordOccurrence.Create(null, rawText, inquirer, "Test", null, noun));
    Assert.Throws<ArgumentNullException>(() => WordOccurrence.Create(handler, null, inquirer, "Test", null, noun));
    Assert.Throws<ArgumentNullException>(() => WordOccurrence.Create(handler, rawText, null, "Test", null, noun));

    // Invalid word text or POS tag values are rejected as well.
    Assert.Throws<ArgumentException>(() => WordOccurrence.Create(handler, rawText, inquirer, null, null, noun));
    Assert.Throws<ArgumentNullException>(() => WordOccurrence.Create(handler, rawText, inquirer, "Test", null, null));
    Assert.Throws<ArgumentException>(() => WordOccurrence.Create(handler, rawText, inquirer, "Test", null, POSTags.Instance.SBAR));
}
public void Setup()
{
    helper = new WordsHandlerHelper();

    // Stemmer mock: lower-cased "test" stems to "T" so Stemmed can be asserted.
    helper.RawTextExractor
        .Setup(item => item.GetWord("test"))
        .Returns("T");

    // Handler mocks: fixed sentiment, stop-word flag, and quantifier value.
    helper.Handler
        .Setup(item => item.CheckSentiment(It.IsAny<WordOccurrence>()))
        .Returns(new SentimentValue(new TestWordItem(), 2));
    helper.Handler
        .Setup(item => item.IsStop(It.IsAny<WordOccurrence>()))
        .Returns(true);
    helper.Handler
        .Setup(item => item.MeasureQuantifier(It.IsAny<WordOccurrence>()))
        .Returns(2);

    instance = WordOccurrence.Create(
        helper.Handler.Object,
        helper.RawTextExractor.Object,
        helper.InquirerManager.Object,
        "Test",
        null,
        POSTags.Instance.NN);
}
public void Add()
{
    Assert.Throws<ArgumentNullException>(() => instance.Add(null));

    // Append the same word twice; both occurrences should aggregate into the instance.
    for (var i = 0; i < 2; i++)
    {
        instance.Add(WordOccurrence.Create(
            helper.Handler.Object,
            helper.RawTextExractor.Object,
            helper.InquirerManager.Object,
            "Test",
            null,
            POSTags.Instance.NN));
    }

    Assert.AreEqual("test test", instance.Text);
    Assert.AreEqual("t t", instance.Stemmed);
    Assert.AreEqual("NN", instance.POS.Tag);
    Assert.IsNotNull(instance.Relationship);
    Assert.IsFalse(instance.IsSentiment);
    Assert.IsFalse(instance.IsTopAttribute);
    Assert.AreEqual(2, instance.QuantValue);
    Assert.IsFalse(instance.IsQuestion);
    Assert.IsFalse(instance.IsFeature);
    Assert.IsFalse(instance.IsFixed);
    Assert.IsFalse(instance.IsStopWord);
    Assert.IsFalse(instance.IsSimple);
    Assert.AreEqual(2, instance.AllWords.Count());
}
public void CreateFixed()
{
    // Words carrying the "xxx" prefix are created as fixed occurrences.
    instance = WordOccurrence.Create(
        helper.Handler.Object,
        helper.RawTextExractor.Object,
        helper.InquirerManager.Object,
        "xxxbad",
        null,
        POSTags.Instance.NN);

    Assert.IsTrue(instance.IsFixed);
}
/// <summary>
/// Builds (or resumes building) word-frequency dictionaries from the corpus
/// text files and persists them under the "frequencies" download subdirectory.
/// Interactive: asks the user for the number of sentences to parse and
/// whether to resume a previous run.
/// </summary>
private static void BuildFrequencyDictionary()
{
    var result = new FrequencyResults();

    Console.WriteLine("How many sentences do you want to parse?");
    // Robust input handling: the original int.Parse(Console.ReadLine()) threw
    // FormatException (or ArgumentNullException on EOF) for invalid input.
    int nbOfSentencesToParse;
    while (!int.TryParse(Console.ReadLine(), out nbOfSentencesToParse))
    {
        Console.WriteLine("Please enter a valid integer number of sentences:");
    }

    var nbOfAlreadyParsedSentences = 0;
    var frequencyDirectory = Utilities.PathToDownloadDirectory + "frequencies";
    if (!Directory.Exists(frequencyDirectory))
    {
        Directory.CreateDirectory(frequencyDirectory);
    }

    var frequencyFilePath = frequencyDirectory + "/frequencies.txt";
    var excludedFrequencyFilePath = frequencyDirectory + "/excluded-frequencies.txt";
    var nbOfSentencesParsedFilePath = frequencyDirectory + "/nbOfSentencesParsed.txt";

    // Offer to resume from the sentence count recorded by a previous run.
    var parsingResumed = false;
    if (File.Exists(nbOfSentencesParsedFilePath))
    {
        int nbOfSentencesParsed;
        if (int.TryParse(File.ReadAllText(nbOfSentencesParsedFilePath), out nbOfSentencesParsed))
        {
            Console.WriteLine("{0} sentences have already been parsed. Resume parsing? (y/n)", nbOfSentencesParsed);
            var resumeParsing = string.Equals(Console.ReadLine(), "Y", StringComparison.InvariantCultureIgnoreCase);
            if (resumeParsing)
            {
                nbOfAlreadyParsedSentences = nbOfSentencesParsed;
                parsingResumed = true;
            }
        }
    }

    var sw = Stopwatch.StartNew();
    Console.WriteLine("Building of frequency dictionary started");

    // Tokenize the sentences and compute the frequencies
    Func<string[], int, bool> extractTokens = (tokens, sentenceCounter) =>
    {
        for (var i = 0; i < tokens.Length; i++)
        {
            var wordOccurence = new WordOccurrence()
            {
                IsFirstTokenInSentence = i == 0,
                Word = tokens[i]
            };
            result.AddOccurence(wordOccurence);
        }

        return true;
    };
    Utilities.ExtractTokensFromTxtFiles(extractTokens, nbOfSentencesToParse, nbOfAlreadyParsedSentences);

    // Load previous frequency dictionaries that were already computed
    if (parsingResumed)
    {
        result.LoadFrequencyDictionary(frequencyFilePath);
        result.LoadFrequencyDictionary(excludedFrequencyFilePath);
    }

    // Save frequency files on disk
    result.SaveFrequencyDictionary(frequencyFilePath);
    result.SaveExcludedFrequencyDictionary(excludedFrequencyFilePath);

    // Save the nb of sentences parsed (for information and being able to relaunch the parsing at this point)
    // NOTE(review): when resuming, this stores only this run's count, discarding
    // nbOfAlreadyParsedSentences — presumably it should be the cumulative total.
    // TODO confirm against Utilities.ExtractTokensFromTxtFiles semantics before changing.
    File.WriteAllText(nbOfSentencesParsedFilePath, nbOfSentencesToParse.ToString());

    Console.WriteLine("Building of frequency dictionary done");
    Console.WriteLine("=====================================");
    sw.Stop();
    // Fixed user-facing typo: "Ellapsed" -> "Elapsed".
    Console.WriteLine("Elapsed time: {0}", sw.Elapsed.ToString("g"));
}
public IDictionaryEnumerator GetWordsByOccurrenceEnumerator()
{
    // Re-key each (word -> count) pair from the alphabetical enumeration as a
    // WordOccurrence so the SortedList orders entries by occurrence.
    var byOccurrence = new SortedList();
    var alphabetical = GetWordsAlphabeticallyEnumerator();
    while (alphabetical.MoveNext())
    {
        var occurrence = new WordOccurrence((int)alphabetical.Value, (string)alphabetical.Key);
        byOccurrence.Add(occurrence, null);
    }

    return byOccurrence.GetEnumerator();
}
// Computes the review rating by combining per-word SVM model weights with
// lexicon sentiments the model did not cover; falls back to the raw lexicon
// rating when the model yields no vector.
protected override void CalculateRatingLogic()
{
    TextVectorCell[] cells = Review.Vector.GetCells().ToArray();
    (double Probability, double Normalization, VectorData Vector) result = Model.GetVector(cells);
    VectorData vector = result.Vector;

    // No model vector available — use the plain lexicon-based rating instead.
    if (vector == null || vector.Length == 0)
    {
        Rating = Review.CalculateRawRating();
        return;
    }

    var bias = vector.RHO;
    var fallbackWeight = 0.1; // default weight for sentiments unknown to the model
    VectorCell lexicon = default;
    foreach (VectorCell item in vector.Cells)
    {
        var cell = (TextVectorCell)item.Data;

        // Remember the special rating-stars cell; its Theta scales the fallback weight below.
        if (cell.Name == Constants.RATING_STARS)
        {
            lexicon = item;
        }

        if (cell.Item != null)
        {
            // Cell is backed by a concrete word — record its SVM-adjusted sentiment.
            var word = (IWordItem)cell.Item;
            Add(new SentimentValue(word, word.Text, new SentimentValueData(item.Calculated, SentimentSource.AdjustedSVM)));
        }
        else
        {
            // Cells without a word item fold into the overall bias term.
            bias += item.Calculated;
        }
    }

    // Collect lexicon sentiments that the model vector did not account for.
    var notAddedSentiments = new List<SentimentValue>();
    foreach (SentimentValue sentimentValue in Review.GetAllSentiments())
    {
        if (!ContainsSentiment(sentimentValue.Owner))
        {
            notAddedSentiments.Add(sentimentValue);
        }
    }

    if (lexicon != null)
    {
        // Spread the rating-stars weight across all sentiment words.
        // NOTE(review): totalWords == 0 makes this a division by zero (Infinity/NaN),
        // though in that case notAddedSentiments is empty and the value appears
        // unused — confirm before relying on it.
        var totalWords = Review.GetAllSentiments().Length;
        fallbackWeight = Math.Abs(lexicon.Theta) / totalWords;
    }

    if (notAddedSentiments.Count > 0)
    {
        // Add the uncovered lexicon sentiments at a reduced (fallback) weight.
        foreach (SentimentValue sentiment in notAddedSentiments)
        {
            Add(new SentimentValue(sentiment.Owner, sentiment.Span, new SentimentValueData(sentiment.DataValue.Value * fallbackWeight, SentimentSource.AdjustedCalculated)));
        }
    }

    if (TotalSentiments > 0)
    {
        // Inject the accumulated bias as a synthetic "BIAS" sentiment entry.
        Add(new SentimentValue(
            WordOccurrence.CreateBasic(Constants.BIAS, POSTags.Instance.JJ),
            "BIAS",
            new SentimentValueData(bias, SentimentSource.AdjustedSVM)));
    }

    if (Rating.HasValue)
    {
        // Log disagreement between the lexicon rating sign and the model probability.
        if (Rating.IsPositive.Value && result.Probability < 0.5)
        {
            log.LogDebug("Mistmatch in sentiment with machine prediction: {0} - {1}", Rating.IsPositive, result.Probability);
        }
    }
}