예제 #1
0
        /// <summary>
        /// Imports the stopwords from the configured path and stores them in the
        /// workspace under the "stopwords" key, then logs where they came from.
        /// </summary>
        public override void Compute()
        {
            TLStopwords imported = DevelopmentKit.IO.Stopwords.Import(_config.Path.Absolute);

            Workspace.Store("stopwords", imported);
            Logger.Info("Stopwords has been imported from " + _config.Path);
        }
예제 #2
0
        /// <summary>
        /// Stores three artifacts and four stopwords in the workspace, runs the
        /// component under test, and checks the artifact texts that are loaded
        /// back from the workspace afterwards.
        /// </summary>
        public void CleanArtifactsWithStopwords()
        {
            TLArtifactsCollection artifacts = new TLArtifactsCollection();

            artifacts.Add(new TLArtifact("id1", "clean these words"));
            artifacts.Add(new TLArtifact("id2", "this has a stopword"));
            artifacts.Add(new TLArtifact("id3", "expression"));
            Workspace.Store("listOfArtifacts", artifacts);

            TLStopwords stopwords = new TLStopwords();

            stopwords.Add("these");
            stopwords.Add("has");
            stopwords.Add("an");
            stopwords.Add("a");
            Workspace.Store("stopwords", stopwords);

            TestComponent.Compute();

            artifacts = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
            // Reloaded with a cast only; the value itself is not inspected below.
            stopwords = (TLStopwords)Workspace.Load("stopwords");

            // Assert.AreEqual takes (expected, actual); passing them in that order
            // keeps the failure message from reporting the two values reversed.
            Assert.AreEqual("clean words", artifacts["id1"].Text);
            Assert.AreEqual("this stopword", artifacts["id2"].Text);
            Assert.AreEqual("expression", artifacts["id3"].Text);
        }
예제 #3
0
        /// <summary>
        /// Loads the artifacts and the stopwords from the workspace, passes them to
        /// StopwordsRemover.ProcessArtifacts together with the configured minimum word
        /// length and number-removal flag, and stores the returned collection under
        /// "listOfArtifacts".
        /// </summary>
        public override void Compute()
        {
            TLArtifactsCollection input = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
            // NOTE(review): this key is capitalized ("Stopwords") - confirm it matches the
            // name the stopwords are actually stored under.
            TLStopwords words = (TLStopwords)Workspace.Load("Stopwords");

            Workspace.Store(
                "listOfArtifacts",
                StopwordsRemover.ProcessArtifacts(input, words, _config.MinWordLength, _config.RemoveNumbers));
        }
        /// <summary>
        /// Loads the artifacts and the stopwords from the workspace, hands both to
        /// ProcessArtifacts, and stores the artifacts collection back under
        /// "listOfArtifacts".
        /// </summary>
        public override void Compute()
        {
            TLArtifactsCollection artifacts = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
            TLStopwords words = (TLStopwords)Workspace.Load("stopwords");

            // ProcessArtifacts returns nothing here; the same collection instance
            // that was loaded is what gets stored back.
            ProcessArtifacts(artifacts, words);
            Workspace.Store("listOfArtifacts", artifacts);
        }
        /// <summary>
        /// Runs the component with a null artifacts entry and an empty stopwords
        /// list in the workspace. The method itself asserts nothing.
        /// </summary>
        public void NullArtifactsTest()
        {
            Workspace.Store("listOfArtifacts", null);
            Workspace.Store("stopwords", new TLStopwords());

            TestComponent.Compute();
        }
예제 #6
0
        /// <summary>
        /// Runs the component with a plain string stored under "listOfArtifacts"
        /// and an empty stopwords list. The method itself asserts nothing.
        /// </summary>
        public void IncorrectArtifactsType()
        {
            Workspace.Store("listOfArtifacts", "incorrect type");
            Workspace.Store("stopwords", new TLStopwords());

            TestComponent.Compute();
        }
예제 #7
0
        /// <summary>
        /// Runs the component with a null artifacts entry and an empty stopwords
        /// list in the workspace. The method itself asserts nothing.
        /// </summary>
        public void NullArtifactsTest()
        {
            Workspace.Store("listOfArtifacts", null);
            Workspace.Store("stopwords", new TLStopwords());

            TestComponent.Compute();
        }
예제 #8
0
 /// <summary>
 /// Builds a new TLArtifactsCollection holding, for every artifact in the input,
 /// a new artifact with the same id whose text is the result of ProcessText.
 /// </summary>
 /// <param name="listOfArtifacts">Artifacts collection; must not be null</param>
 /// <param name="stopwords">Stopwords collection; must not be null</param>
 /// <param name="minWordLength">Minimum word length, forwarded to ProcessText</param>
 /// <param name="removeNumbers">Flag to remove numbers, forwarded to ProcessText</param>
 /// <returns>Newly created collection of processed artifacts</returns>
 /// <exception cref="ArgumentNullException">
 /// Thrown when <paramref name="listOfArtifacts"/> or <paramref name="stopwords"/> is null.
 /// </exception>
 public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, TLStopwords stopwords, int minWordLength, bool removeNumbers)
 {
     // Fail fast with a named argument instead of a NullReferenceException
     // raised from somewhere inside the loop.
     if (listOfArtifacts == null)
     {
         throw new ArgumentNullException("listOfArtifacts");
     }
     if (stopwords == null)
     {
         throw new ArgumentNullException("stopwords");
     }

     TLArtifactsCollection processed = new TLArtifactsCollection();
     foreach (TLArtifact artifact in listOfArtifacts.Values)
     {
         // A fresh artifact is created per entry; the input artifact is only read.
         TLArtifact processedArtifact = new TLArtifact(artifact.Id, String.Empty);
         processedArtifact.Text = ProcessText(artifact.Text, stopwords, minWordLength, removeNumbers);
         processed.Add(processedArtifact);
     }
     return processed;
 }
예제 #9
0
 /// <summary>
 /// Reads the file at the given path line by line and adds each line, trimmed of
 /// surrounding whitespace, to a new TLStopwords collection.
 /// </summary>
 /// <param name="filepath">Path of the file to read</param>
 /// <returns>Collection containing one entry per line read</returns>
 public static TLStopwords Import(string filepath)
 {
     TLStopwords stopwords = new TLStopwords();

     // The using block disposes the reader (and releases the file handle)
     // even when reading or adding throws.
     using (TextReader reader = new StreamReader(filepath))
     {
         string line;
         while ((line = reader.ReadLine()) != null)
         {
             stopwords.Add(line.Trim());
         }
     }
     return stopwords;
 }
예제 #10
0
파일: Stopwords.cs 프로젝트: thbin/TraceLab
        /// <summary>
        /// Reads the file at the given path line by line and adds each line, trimmed of
        /// surrounding whitespace, to a new TLStopwords collection.
        /// </summary>
        /// <param name="filepath">Path of the file to read</param>
        /// <returns>Collection containing one entry per line read</returns>
        public static TLStopwords Import(string filepath)
        {
            TLStopwords stopwords = new TLStopwords();

            // The using block disposes the reader (and releases the file handle)
            // even when reading or adding throws.
            using (TextReader reader = new StreamReader(filepath))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    stopwords.Add(line.Trim());
                }
            }
            return stopwords;
        }
예제 #11
0
        /// <summary>
        /// Runs the component with one artifact and a null stopwords entry in the
        /// workspace. The method itself asserts nothing.
        /// </summary>
        public void NullStopwordsTest()
        {
            TLArtifactsCollection artifacts = new TLArtifactsCollection();

            artifacts.Add(new TLArtifact("id", "word to clean"));
            Workspace.Store("listOfArtifacts", artifacts);

            // The scenario under test is a null stopwords entry, so no TLStopwords
            // instance is created here (the previous unused local was removed).
            Workspace.Store("stopwords", null);

            TestComponent.Compute();
        }
        /// <summary>
        /// Runs the component with an empty artifacts collection and four stopwords
        /// in the workspace. The method itself asserts nothing.
        /// </summary>
        public void EmptyArtifactListTest()
        {
            Workspace.Store("listOfArtifacts", new TLArtifactsCollection());

            TLStopwords words = new TLStopwords();
            foreach (string word in new string[] { "one", "word", "to", "add" })
            {
                words.Add(word);
            }
            Workspace.Store("stopwords", words);

            TestComponent.Compute();
        }
        /// <summary>
        /// Runs the component with four artifacts and an empty stopwords list in the
        /// workspace. The method itself asserts nothing.
        /// </summary>
        public void EmptyStopWordsListTest()
        {
            TLArtifactsCollection inputs = new TLArtifactsCollection();
            inputs.Add(new TLArtifact("id", "text 1"));
            inputs.Add(new TLArtifact("id1", "text two"));
            inputs.Add(new TLArtifact("id2", "text is three"));
            inputs.Add(new TLArtifact("id3", "text has a the stop word"));

            Workspace.Store("listOfArtifacts", inputs);
            Workspace.Store("stopwords", new TLStopwords());

            TestComponent.Compute();
        }
예제 #14
0
        /// <summary>
        /// Runs the component with four artifacts and an empty stopwords list in the
        /// workspace. The method itself asserts nothing.
        /// </summary>
        public void EmptyStopWordsListTest()
        {
            TLArtifactsCollection inputs = new TLArtifactsCollection();
            inputs.Add(new TLArtifact("id", "text 1"));
            inputs.Add(new TLArtifact("id1", "text two"));
            inputs.Add(new TLArtifact("id2", "text is three"));
            inputs.Add(new TLArtifact("id3", "text has a the stop word"));

            Workspace.Store("listOfArtifacts", inputs);
            Workspace.Store("stopwords", new TLStopwords());

            TestComponent.Compute();
        }
예제 #15
0
        /// <summary>
        /// Runs the component with an empty artifacts collection and four stopwords
        /// in the workspace. The method itself asserts nothing.
        /// </summary>
        public void EmptyArtifactListTest()
        {
            Workspace.Store("listOfArtifacts", new TLArtifactsCollection());

            TLStopwords words = new TLStopwords();
            foreach (string word in new string[] { "one", "word", "to", "add" })
            {
                words.Add(word);
            }
            Workspace.Store("stopwords", words);

            TestComponent.Compute();
        }
        /// <summary>
        /// Replaces the text of every artifact in the collection, in place, with the
        /// result of PreprocessorStopWords.Process for that text and the stopwords.
        /// </summary>
        /// <param name="listOfArtifacts">Artifacts whose text is rewritten</param>
        /// <param name="stopwords">Stopwords passed to the preprocessor</param>
        /// <exception cref="ComponentException">Thrown when either argument is null.</exception>
        private static void ProcessArtifacts(TLArtifactsCollection listOfArtifacts, TLStopwords stopwords)
        {
            // The artifacts are checked first, so a null collection is the one
            // reported when both arguments are null.
            // NOTE(review): both messages contain the spelling "Recieved"; they are
            // kept verbatim because they are runtime text.
            if (listOfArtifacts == null)
            {
                throw new ComponentException("Recieved null listofArtifacts");
            }
            if (stopwords == null)
            {
                throw new ComponentException("Recieved null stopwords");
            }

            foreach (TLArtifact current in listOfArtifacts.Values)
            {
                string cleaned = PreprocessorStopWords.Process(current.Text, stopwords);
                current.Text = cleaned;
            }
        }
예제 #17
0
        /// <summary>
        /// Calls StopwordsRemover.ProcessArtifacts on three artifacts with two
        /// stopwords, a minimum word length of 4 and number removal disabled, then
        /// checks the text of each returned artifact.
        /// </summary>
        public void CleanArtifactsWithStopwords()
        {
            TLArtifactsCollection artifacts = new TLArtifactsCollection();
            artifacts.Add(new TLArtifact("id1", "clean these words"));
            artifacts.Add(new TLArtifact("id2", "this has a stopword"));
            artifacts.Add(new TLArtifact("id3", "an expression"));

            TLStopwords stopwords = new TLStopwords();
            stopwords.Add("these");
            stopwords.Add("this");

            TLArtifactsCollection processedArtifacts = StopwordsRemover.ProcessArtifacts(artifacts, stopwords, 4, false);

            // Assert.AreEqual takes (expected, actual); passing them in that order
            // keeps the failure message from reporting the two values reversed.
            Assert.AreEqual("clean words", processedArtifacts["id1"].Text);
            Assert.AreEqual("stopword", processedArtifacts["id2"].Text);
            Assert.AreEqual("expression", processedArtifacts["id3"].Text);
        }
        /// <summary>
        /// Replaces the text of every artifact in the collection, in place, with the
        /// result of PreprocessorStopWords.Process for that text and the stopwords.
        /// </summary>
        /// <param name="listOfArtifacts">Artifacts whose text is rewritten</param>
        /// <param name="stopwords">Stopwords passed to the preprocessor</param>
        /// <exception cref="ComponentException">Thrown when either argument is null.</exception>
        private static void ProcessArtifacts(TLArtifactsCollection listOfArtifacts, TLStopwords stopwords)
        {
            // The artifacts are checked first, so a null collection is the one
            // reported when both arguments are null.
            // NOTE(review): both messages contain the spelling "Recieved"; they are
            // kept verbatim because they are runtime text.
            if (listOfArtifacts == null)
            {
                throw new ComponentException("Recieved null listofArtifacts");
            }
            if (stopwords == null)
            {
                throw new ComponentException("Recieved null stopwords");
            }

            foreach (TLArtifact current in listOfArtifacts.Values)
            {
                string cleaned = PreprocessorStopWords.Process(current.Text, stopwords);
                current.Text = cleaned;
            }
        }
예제 #19
0
        /// <summary>
        /// Calls StopwordsRemover.ProcessArtifacts on three artifacts with two
        /// stopwords, a minimum word length of 4 and number removal disabled, then
        /// checks the text of each returned artifact.
        /// </summary>
        public void CleanArtifactsWithStopwords()
        {
            TLArtifactsCollection artifacts = new TLArtifactsCollection();

            artifacts.Add(new TLArtifact("id1", "clean these words"));
            artifacts.Add(new TLArtifact("id2", "this has a stopword"));
            artifacts.Add(new TLArtifact("id3", "an expression"));

            TLStopwords stopwords = new TLStopwords();

            stopwords.Add("these");
            stopwords.Add("this");

            TLArtifactsCollection processedArtifacts = StopwordsRemover.ProcessArtifacts(artifacts, stopwords, 4, false);

            // Assert.AreEqual takes (expected, actual); passing them in that order
            // keeps the failure message from reporting the two values reversed.
            Assert.AreEqual("clean words", processedArtifacts["id1"].Text);
            Assert.AreEqual("stopword", processedArtifacts["id2"].Text);
            Assert.AreEqual("expression", processedArtifacts["id3"].Text);
        }
예제 #20
0
        /// <summary>
        /// Validates the configured path, reads the stopwords from it through
        /// StopwordsReader.ReadStopwords, stores them in the workspace under
        /// "stopwords" and logs the import.
        /// </summary>
        /// <exception cref="ComponentException">
        /// Thrown when no path is configured or File.Exists reports the path as missing.
        /// </exception>
        public override void Compute()
        {
            StopwordsImporterConfig settings = (StopwordsImporterConfig)this.Configuration;

            // Reject an unset path before touching the file system.
            if (settings.Path == null)
            {
                throw new ComponentException("Path has not been specified.");
            }

            bool fileIsPresent = File.Exists(settings.Path);
            if (!fileIsPresent)
            {
                throw new ComponentException(String.Format("File does not exist '{0}'.", settings.Path.Absolute));
            }

            TLStopwords loaded = StopwordsReader.ReadStopwords(settings.Path);
            Workspace.Store("stopwords", loaded);

            Logger.Info("Stopwords has been imported from " + settings.Path);
        }
예제 #21
0
 /// <summary>
 /// Splits the input on whitespace and rebuilds it, separated by single spaces,
 /// from the tokens that survive the stopword, length and number filters.
 /// </summary>
 /// <param name="textToProcess">Input text</param>
 /// <param name="stopwords">Stopwords collection; tokens it contains are dropped</param>
 /// <param name="minWordLength">Tokens shorter than this length are dropped</param>
 /// <param name="removeNumbers">When true, tokens for which IsNumber returns true are dropped</param>
 /// <returns>The kept tokens joined by spaces, trimmed</returns>
 public static string ProcessText(string textToProcess, TLStopwords stopwords, int minWordLength, bool removeNumbers)
 {
     StringBuilder kept = new StringBuilder();
     foreach (string word in textToProcess.Split())
     {
         // Stopwords and too-short tokens never reach the number check.
         if (stopwords.Contains(word) || word.Length < minWordLength)
         {
             continue;
         }
         if (removeNumbers && IsNumber(word))
         {
             continue;
         }
         kept.Append(word).Append(' ');
     }
     // Trim drops the separator appended after the last kept token.
     return kept.ToString().Trim();
 }
예제 #22
0
        /// <summary>
        /// Splits the input on whitespace and rebuilds it, separated by single spaces,
        /// from the tokens that survive the stopword, length and number filters.
        /// </summary>
        /// <param name="textToProcess">Input text</param>
        /// <param name="stopwords">Stopwords collection; tokens it contains are dropped</param>
        /// <param name="minWordLength">Tokens shorter than this length are dropped</param>
        /// <param name="removeNumbers">When true, tokens for which IsNumber returns true are dropped</param>
        /// <returns>The kept tokens joined by spaces, trimmed</returns>
        public static string ProcessText(string textToProcess, TLStopwords stopwords, int minWordLength, bool removeNumbers)
        {
            StringBuilder kept = new StringBuilder();
            foreach (string word in textToProcess.Split())
            {
                // Stopwords and too-short tokens never reach the number check.
                if (stopwords.Contains(word) || word.Length < minWordLength)
                {
                    continue;
                }
                if (removeNumbers && IsNumber(word))
                {
                    continue;
                }
                kept.Append(word).Append(' ');
            }
            // Trim drops the separator appended after the last kept token.
            return kept.ToString().Trim();
        }
        /// <summary>
        /// Runs the component with a plain string stored under "listOfArtifacts"
        /// and an empty stopwords list. The method itself asserts nothing.
        /// </summary>
        public void IncorrectArtifactsType()
        {
            Workspace.Store("listOfArtifacts", "incorrect type");
            Workspace.Store("stopwords", new TLStopwords());

            TestComponent.Compute();
        }
        /// <summary>
        /// Runs the component with one artifact and a null stopwords entry in the
        /// workspace. The method itself asserts nothing.
        /// </summary>
        public void NullStopwordsTest()
        {
            TLArtifactsCollection artifacts = new TLArtifactsCollection();
            artifacts.Add(new TLArtifact("id", "word to clean"));
            Workspace.Store("listOfArtifacts", artifacts);

            // The scenario under test is a null stopwords entry, so no TLStopwords
            // instance is created here (the previous unused local was removed).
            Workspace.Store("stopwords", null);

            TestComponent.Compute();
        }
예제 #25
0
        /// <summary>
        /// Builds a new TLArtifactsCollection holding, for every artifact in the input,
        /// a new artifact with the same id whose text is the result of ProcessText.
        /// </summary>
        /// <param name="listOfArtifacts">Artifacts collection; must not be null</param>
        /// <param name="stopwords">Stopwords collection; must not be null</param>
        /// <param name="minWordLength">Minimum word length, forwarded to ProcessText</param>
        /// <param name="removeNumbers">Flag to remove numbers, forwarded to ProcessText</param>
        /// <returns>Newly created collection of processed artifacts</returns>
        /// <exception cref="ArgumentNullException">
        /// Thrown when <paramref name="listOfArtifacts"/> or <paramref name="stopwords"/> is null.
        /// </exception>
        public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, TLStopwords stopwords, int minWordLength, bool removeNumbers)
        {
            // Fail fast with a named argument instead of a NullReferenceException
            // raised from somewhere inside the loop.
            if (listOfArtifacts == null)
            {
                throw new ArgumentNullException("listOfArtifacts");
            }
            if (stopwords == null)
            {
                throw new ArgumentNullException("stopwords");
            }

            TLArtifactsCollection processed = new TLArtifactsCollection();

            foreach (TLArtifact artifact in listOfArtifacts.Values)
            {
                // A fresh artifact is created per entry; the input artifact is only read.
                TLArtifact processedArtifact = new TLArtifact(artifact.Id, String.Empty);
                processedArtifact.Text = ProcessText(artifact.Text, stopwords, minWordLength, removeNumbers);
                processed.Add(processedArtifact);
            }
            return processed;
        }
        /// <summary>
        /// Stores three artifacts and four stopwords in the workspace, runs the
        /// component under test, and checks the artifact texts that are loaded
        /// back from the workspace afterwards.
        /// </summary>
        public void CleanArtifactsWithStopwords()
        {
            TLArtifactsCollection artifacts = new TLArtifactsCollection();
            artifacts.Add(new TLArtifact("id1", "clean these words"));
            artifacts.Add(new TLArtifact("id2", "this has a stopword"));
            artifacts.Add(new TLArtifact("id3", "expression"));
            Workspace.Store("listOfArtifacts", artifacts);

            TLStopwords stopwords = new TLStopwords();
            stopwords.Add("these");
            stopwords.Add("has");
            stopwords.Add("an");
            stopwords.Add("a");
            Workspace.Store("stopwords", stopwords);

            TestComponent.Compute();

            artifacts = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
            // Reloaded with a cast only; the value itself is not inspected below.
            stopwords = (TLStopwords)Workspace.Load("stopwords");

            // Assert.AreEqual takes (expected, actual); passing them in that order
            // keeps the failure message from reporting the two values reversed.
            Assert.AreEqual("clean words", artifacts["id1"].Text);
            Assert.AreEqual("this stopword", artifacts["id2"].Text);
            Assert.AreEqual("expression", artifacts["id3"].Text);
        }