/// <summary>
/// Imports the stopwords file named by the component configuration, publishes the
/// result to the workspace under "stopwords", and logs where it came from.
/// </summary>
public override void Compute()
{
    // Import straight into the workspace slot; no intermediate local is needed.
    Workspace.Store("stopwords", DevelopmentKit.IO.Stopwords.Import(_config.Path.Absolute));

    Logger.Info("Stopwords has been imported from " + _config.Path);
}
/// <summary>
/// Stores three artifacts and a four-entry stopwords list in the workspace, runs the
/// component, and checks the artifact texts read back from the workspace.
/// </summary>
public void CleanArtifactsWithStopwords()
{
    TLArtifactsCollection artifacts = new TLArtifactsCollection();
    artifacts.Add(new TLArtifact("id1", "clean these words"));
    artifacts.Add(new TLArtifact("id2", "this has a stopword"));
    artifacts.Add(new TLArtifact("id3", "expression"));
    Workspace.Store("listOfArtifacts", artifacts);

    TLStopwords stopwords = new TLStopwords();
    stopwords.Add("these");
    stopwords.Add("has");
    stopwords.Add("an");
    stopwords.Add("a");
    Workspace.Store("stopwords", stopwords);

    TestComponent.Compute();

    artifacts = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
    // Reloaded but not inspected further; the cast still fails if the entry is no longer a TLStopwords.
    stopwords = (TLStopwords)Workspace.Load("stopwords");

    // Expected value goes first so a failure message labels expected/actual correctly.
    Assert.AreEqual("clean words", artifacts["id1"].Text);
    Assert.AreEqual("this stopword", artifacts["id2"].Text);
    Assert.AreEqual("expression", artifacts["id3"].Text);
}
/// <summary>
/// Loads the artifacts and stopwords from the workspace, runs them through
/// StopwordsRemover.ProcessArtifacts with the configured minimum word length and
/// number-removal flag, and stores the returned collection back as "listOfArtifacts".
/// </summary>
public override void Compute()
{
    TLArtifactsCollection input = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");

    // NOTE(review): this key is capitalised ("Stopwords") — confirm it matches the name the workspace entry is stored under.
    TLStopwords stopwordList = (TLStopwords)Workspace.Load("Stopwords");

    Workspace.Store(
        "listOfArtifacts",
        StopwordsRemover.ProcessArtifacts(input, stopwordList, _config.MinWordLength, _config.RemoveNumbers));
}
/// <summary>
/// Loads "listOfArtifacts" and "stopwords" from the workspace, hands both to
/// ProcessArtifacts, and stores the same artifacts collection back under "listOfArtifacts".
/// </summary>
public override void Compute()
{
    TLArtifactsCollection artifacts = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
    TLStopwords stopwordList = (TLStopwords)Workspace.Load("stopwords");

    ProcessArtifacts(artifacts, stopwordList);

    // The same collection instance that was loaded is written back.
    Workspace.Store("listOfArtifacts", artifacts);
}
/// <summary>
/// Runs the component with a null "listOfArtifacts" entry and an empty stopwords list.
/// The expected outcome is not asserted here — presumably declared by a test attribute; verify.
/// </summary>
public void NullArtifactsTest()
{
    Workspace.Store("listOfArtifacts", null);
    Workspace.Store("stopwords", new TLStopwords());

    TestComponent.Compute();
}
/// <summary>
/// Runs the component with a plain string stored under "listOfArtifacts" and an empty stopwords list.
/// The expected outcome is not asserted here — presumably declared by a test attribute; verify.
/// </summary>
public void IncorrectArtifactsType()
{
    Workspace.Store("listOfArtifacts", "incorrect type");
    Workspace.Store("stopwords", new TLStopwords());

    TestComponent.Compute();
}
/// <summary>
/// Builds a new artifacts collection in which every artifact keeps its id and carries
/// the text returned by <c>ProcessText</c> for the original artifact's text.
/// </summary>
/// <param name="listOfArtifacts">Artifacts to process; the input artifacts are not modified</param>
/// <param name="stopwords">Stopwords passed through to <c>ProcessText</c></param>
/// <param name="minWordLength">Minimum word length passed through to <c>ProcessText</c></param>
/// <param name="removeNumbers">Number-removal flag passed through to <c>ProcessText</c></param>
/// <returns>A new collection with one processed artifact per input artifact</returns>
public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, TLStopwords stopwords, int minWordLength, bool removeNumbers)
{
    TLArtifactsCollection result = new TLArtifactsCollection();

    foreach (TLArtifact source in listOfArtifacts.Values)
    {
        TLArtifact cleaned = new TLArtifact(source.Id, String.Empty);
        string cleanedText = ProcessText(source.Text, stopwords, minWordLength, removeNumbers);
        cleaned.Text = cleanedText;
        result.Add(cleaned);
    }

    return result;
}
/// <summary>
/// Reads a text file line by line and adds each trimmed line to a new stopwords collection.
/// </summary>
/// <param name="filepath">Path of the file to read</param>
/// <returns>Collection that received every line of the file, trimmed of surrounding whitespace</returns>
public static TLStopwords Import(string filepath)
{
    TLStopwords stopwords = new TLStopwords();

    // The using block releases the file handle even when reading or Add throws.
    using (TextReader reader = new StreamReader(filepath))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            // Every line is passed to Add, including ones that are empty after trimming.
            stopwords.Add(line.Trim());
        }
    }

    return stopwords;
}
/// <summary>
/// Loads a stopwords collection from a text file, treating each line as one entry.
/// </summary>
/// <param name="filepath">Path of the file to read</param>
/// <returns>Collection containing each line of the file after trimming surrounding whitespace</returns>
public static TLStopwords Import(string filepath)
{
    TLStopwords stopwords = new TLStopwords();

    // Dispose the reader deterministically so the file is not left open after the call.
    using (TextReader reader = new StreamReader(filepath))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            line = line.Trim();
            // No filtering: lines that trim down to an empty string are added as well.
            stopwords.Add(line);
        }
    }

    return stopwords;
}
/// <summary>
/// Runs the component with one artifact in "listOfArtifacts" and a null "stopwords" entry.
/// The expected outcome is not asserted here — presumably declared by a test attribute; verify.
/// </summary>
public void NullStopwordsTest()
{
    TLArtifactsCollection artifacts = new TLArtifactsCollection();
    artifacts.Add(new TLArtifact("id", "word to clean"));
    Workspace.Store("listOfArtifacts", artifacts);

    // Null is stored directly; the previously constructed TLStopwords instance was never used.
    Workspace.Store("stopwords", null);

    TestComponent.Compute();
}
/// <summary>
/// Runs the component with an empty artifacts collection and a four-entry stopwords list.
/// No assertions follow the call, so the test only requires that Compute completes.
/// </summary>
public void EmptyArtifactListTest()
{
    Workspace.Store("listOfArtifacts", new TLArtifactsCollection());

    TLStopwords stopwordList = new TLStopwords();
    foreach (string word in new string[] { "one", "word", "to", "add" })
    {
        stopwordList.Add(word);
    }
    Workspace.Store("stopwords", stopwordList);

    TestComponent.Compute();
}
/// <summary>
/// Runs the component with four artifacts and an empty stopwords list.
/// No assertions follow the call, so the test only requires that Compute completes.
/// </summary>
public void EmptyStopWordsListTest()
{
    TLArtifactsCollection input = new TLArtifactsCollection();
    input.Add(new TLArtifact("id", "text 1"));
    input.Add(new TLArtifact("id1", "text two"));
    input.Add(new TLArtifact("id2", "text is three"));
    input.Add(new TLArtifact("id3", "text has a the stop word"));
    Workspace.Store("listOfArtifacts", input);

    Workspace.Store("stopwords", new TLStopwords());

    TestComponent.Compute();
}
/// <summary>
/// Replaces the text of every artifact in the collection, in place, with the result of
/// PreprocessorStopWords.Process for that text and the given stopwords.
/// </summary>
/// <param name="listOfArtifacts">Artifacts whose Text is overwritten; must not be null</param>
/// <param name="stopwords">Stopwords handed to the preprocessor; must not be null</param>
/// <exception cref="ComponentException">Thrown when either argument is null</exception>
private static void ProcessArtifacts(TLArtifactsCollection listOfArtifacts, TLStopwords stopwords)
{
    // Guard clauses: the artifacts collection is checked before the stopwords.
    if (listOfArtifacts == null)
        throw new ComponentException("Recieved null listofArtifacts");

    if (stopwords == null)
        throw new ComponentException("Recieved null stopwords");

    foreach (TLArtifact item in listOfArtifacts.Values)
    {
        string cleaned = PreprocessorStopWords.Process(item.Text, stopwords);
        item.Text = cleaned;
    }
}
/// <summary>
/// Calls StopwordsRemover.ProcessArtifacts with two stopwords, a minimum word length of 4
/// and number removal off, then checks the text of each returned artifact.
/// </summary>
public void CleanArtifactsWithStopwords()
{
    TLArtifactsCollection artifacts = new TLArtifactsCollection();
    artifacts.Add(new TLArtifact("id1", "clean these words"));
    artifacts.Add(new TLArtifact("id2", "this has a stopword"));
    artifacts.Add(new TLArtifact("id3", "an expression"));

    TLStopwords stopwords = new TLStopwords();
    stopwords.Add("these");
    stopwords.Add("this");

    TLArtifactsCollection processedArtifacts = StopwordsRemover.ProcessArtifacts(artifacts, stopwords, 4, false);

    // Expected value goes first so a failure message labels expected/actual correctly.
    Assert.AreEqual("clean words", processedArtifacts["id1"].Text);
    Assert.AreEqual("stopword", processedArtifacts["id2"].Text);
    Assert.AreEqual("expression", processedArtifacts["id3"].Text);
}
/// <summary>
/// Validates the configured path, reads the stopwords from it via StopwordsReader,
/// stores them in the workspace under "stopwords", and logs the source path.
/// </summary>
/// <exception cref="ComponentException">
/// Thrown when no path is configured or File.Exists reports the path as missing
/// </exception>
public override void Compute()
{
    StopwordsImporterConfig settings = (StopwordsImporterConfig)this.Configuration;

    // Fail fast on configuration problems before touching the file.
    if (settings.Path == null)
        throw new ComponentException("Path has not been specified.");

    bool fileIsPresent = File.Exists(settings.Path);
    if (!fileIsPresent)
        throw new ComponentException(String.Format("File does not exist '{0}'.", settings.Path.Absolute));

    TLStopwords imported = StopwordsReader.ReadStopwords(settings.Path);
    Workspace.Store("stopwords", imported);

    Logger.Info("Stopwords has been imported from " + settings.Path);
}
/// <summary>
/// Splits the text on whitespace and rebuilds it from the tokens that survive filtering,
/// joined by single spaces.
/// </summary>
/// <param name="textToProcess">Text to filter</param>
/// <param name="stopwords">Tokens contained in this collection are dropped</param>
/// <param name="minWordLength">Tokens shorter than this are dropped</param>
/// <param name="removeNumbers">When true, tokens for which <c>IsNumber</c> returns true are dropped</param>
/// <returns>The surviving tokens separated by spaces, trimmed at both ends</returns>
public static string ProcessText(string textToProcess, TLStopwords stopwords, int minWordLength, bool removeNumbers)
{
    StringBuilder kept = new StringBuilder();

    foreach (string word in textToProcess.Split())
    {
        // Stopword and length filters run first; IsNumber is consulted only for tokens that pass them.
        if (stopwords.Contains(word) || word.Length < minWordLength)
        {
            continue;
        }

        if (removeNumbers && IsNumber(word))
        {
            continue;
        }

        kept.Append(word).Append(' ');
    }

    // Trim removes the separator left after the last kept token.
    return kept.ToString().Trim();
}
/// <summary>
/// Filters a whitespace-separated text, keeping only tokens that are not stopwords,
/// meet the minimum length, and (optionally) are not numbers.
/// </summary>
/// <param name="textToProcess">Text to filter</param>
/// <param name="stopwords">Collection of tokens to discard</param>
/// <param name="minWordLength">Shortest token length that is kept</param>
/// <param name="removeNumbers">Whether tokens accepted by <c>IsNumber</c> are discarded</param>
/// <returns>Kept tokens joined with single spaces, without leading or trailing whitespace</returns>
public static string ProcessText(string textToProcess, TLStopwords stopwords, int minWordLength, bool removeNumbers)
{
    string[] words = textToProcess.Split();
    StringBuilder output = new StringBuilder();

    for (int i = 0; i < words.Length; i++)
    {
        string current = words[i];

        bool passesWordFilters = !stopwords.Contains(current) && current.Length >= minWordLength;
        if (!passesWordFilters)
        {
            continue;
        }

        // The number check is evaluated only when requested and only for tokens that passed the filters above.
        bool dropAsNumber = removeNumbers && IsNumber(current);
        if (!dropAsNumber)
        {
            output.Append(current);
            output.Append(" ");
        }
    }

    return output.ToString().Trim();
}
/// <summary>
/// Produces a processed copy of an artifacts collection: each output artifact has the
/// id of its source and the text returned by <c>ProcessText</c> for the source text.
/// </summary>
/// <param name="listOfArtifacts">Source artifacts; they are read but not modified</param>
/// <param name="stopwords">Stopwords forwarded to <c>ProcessText</c></param>
/// <param name="minWordLength">Minimum word length forwarded to <c>ProcessText</c></param>
/// <param name="removeNumbers">Number-removal flag forwarded to <c>ProcessText</c></param>
/// <returns>New collection holding one processed artifact per source artifact</returns>
public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, TLStopwords stopwords, int minWordLength, bool removeNumbers)
{
    TLArtifactsCollection output = new TLArtifactsCollection();

    foreach (TLArtifact original in listOfArtifacts.Values)
    {
        // Start from an empty-text artifact with the same id, then fill in the filtered text.
        TLArtifact copy = new TLArtifact(original.Id, String.Empty);
        copy.Text = ProcessText(original.Text, stopwords, minWordLength, removeNumbers);

        output.Add(copy);
    }

    return output;
}