public void RarePhrasesTest()
        {
            // Pushes blank, whitespace-only, accented and unusual-symbol texts through the
            // component under test and checks that each one comes back as the empty string.
            TLArtifactsCollection collection = new TLArtifactsCollection();
            collection.Add(new TLArtifact("blank", ""));
            collection.Add(new TLArtifact("space", " "));
            collection.Add(new TLArtifact("accents", "à ì")); // à = 133 ì = 141
            collection.Add(new TLArtifact("unrecognized", "╣")); // the debugger sees this value as a square, ascii is 441
            Workspace.Store("listOfArtifacts", collection);
            TestComponent.Compute();

            collection = (TLArtifactsCollection) Workspace.Load("listOfArtifacts");

            // The failure messages previously claimed that 'à ì' and '╣' were expected while the
            // checks compared against "". The messages now report what is actually checked.
            // NOTE(review): if the component is meant to preserve these characters, it is the
            // comparisons (not the messages) that need changing - confirm against the component.
            foreach (string id in new string[] { "blank", "space", "accents", "unrecognized" })
            {
                string text = collection[id].Text;
                if (text != "")
                {
                    Assert.Fail(id + " got '" + text + "' when '' was expected");
                }
            }
        }
Esempio n. 2
0
 /// <summary>
 /// Creates a script that holds on to the given artifact collections and configuration.
 /// </summary>
 /// <param name="source">Source artifacts</param>
 /// <param name="target">Target artifacts</param>
 /// <param name="config">Configuration object</param>
 public GibbsLDAScript(TLArtifactsCollection source, TLArtifactsCollection target, GibbsLDAConfig config)
     : base()
 {
     // References are stored as given; no validation or copying happens here.
     this._config = config;
     this._target = target;
     this._source = source;
 }
        /// <summary>
        /// Loads "listOfArtifacts" from the workspace, passes it to
        /// SimpleStopwordsRemover.ProcessArtifacts together with the configured minimum word
        /// length and remove-numbers flag, and stores the result back under the same key.
        /// </summary>
        public override void Compute()
        {
            var input = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");

            Workspace.Store(
                "listOfArtifacts",
                SimpleStopwordsRemover.ProcessArtifacts(input, _config.MinWordLength, _config.RemoveNumbers));
        }
        /// <summary>
        /// Loads "ListOfArtifacts" from the workspace, passes it to
        /// CamelCaseSplitter.ProcessArtifacts with the configured lowercase option, and
        /// stores the result back under the same key.
        /// </summary>
        public override void Compute()
        {
            var input = (TLArtifactsCollection)Workspace.Load("ListOfArtifacts");

            Workspace.Store(
                "ListOfArtifacts",
                CamelCaseSplitter.ProcessArtifacts(input, _config.ConvertLowercase));
        }
        /// <summary>
        /// Writes an artifacts collection to an XML file: an "artifacts_collection" root element
        /// containing the collection info followed by the artifacts.
        /// </summary>
        /// <param name="artifactsCollection">Collection to write; must not be null</param>
        /// <param name="outputPath">Path of the XML file to create</param>
        /// <param name="collectionId">Forwarded to WriteCollectionInfo</param>
        /// <param name="name">Forwarded to WriteCollectionInfo</param>
        /// <param name="version">Forwarded to WriteCollectionInfo</param>
        /// <param name="description">Forwarded to WriteCollectionInfo</param>
        /// <exception cref="TraceLabSDK.ComponentException">artifactsCollection is null</exception>
        public static void Export(TLArtifactsCollection artifactsCollection, string outputPath, string collectionId, string name, string version, string description)
        {
            if (artifactsCollection == null)
            {
                throw new TraceLabSDK.ComponentException("Received null artifacts collection.");
            }

            var writerSettings = new System.Xml.XmlWriterSettings
            {
                Indent          = true,
                CloseOutput     = true,
                CheckCharacters = true
            };

            // Creates the file at outputPath.
            using (var xml = System.Xml.XmlWriter.Create(outputPath, writerSettings))
            {
                xml.WriteStartDocument();
                xml.WriteStartElement("artifacts_collection");

                WriteCollectionInfo(xml, collectionId, name, version, description);
                WriteArtifacts(artifactsCollection, xml);

                xml.WriteEndElement(); // artifacts_collection
                xml.WriteEndDocument();
                xml.Close();
            }

            System.Diagnostics.Trace.WriteLine("File created , you can find the file " + outputPath);
        }
Esempio n. 6
0
        /// <summary>
        /// Stores three source artifacts, four target artifacts and a one-term dictionary in the
        /// workspace, runs the component with the SimpleMatching metric and loads the resulting
        /// similarity matrix. The test is still a placeholder and always fails at the end.
        /// </summary>
        public void TestTracingOfComponent()
        {
            // TODO: add inputs that matter
            TLArtifactsCollection sources = new TLArtifactsCollection();
            sources.Add(new TLArtifact("id1", "first text"));
            sources.Add(new TLArtifact("id2", "words to do stuff with"));
            sources.Add(new TLArtifact("id3", "some more text"));

            TLArtifactsCollection targets = new TLArtifactsCollection();
            targets.Add(new TLArtifact("id1", "hello world"));
            targets.Add(new TLArtifact("id2", "very very random yes indeed"));
            targets.Add(new TLArtifact("id3", "yep"));
            targets.Add(new TLArtifact("id4", "chickens in the coop"));

            TLDictionaryIndex index = new TLDictionaryIndex();
            index.AddTermEntry("term", 3, 3, 0.2);

            Workspace.Store("sourceArtifacts", sources);
            Workspace.Store("targetArtifacts", targets);
            Workspace.Store("dictionaryIndex", index);

            TracerConfig tracerConfig = (TracerConfig)TestComponent.Configuration;
            tracerConfig.SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

            TestComponent.Compute();

            // Loaded (and cast) so a missing or mistyped output surfaces before the placeholder failure.
            TLSimilarityMatrix simMat = (TLSimilarityMatrix)Workspace.Load("similarityMatrix");

            // TODO: add tests to make sure the output is correctly formatted
            Assert.Fail();
        }
        /// <summary>
        /// Loads "listOfArtifacts" from the workspace, passes it to
        /// PorterStemmerUtils.ProcessArtifacts, and stores the result back under the same key.
        /// </summary>
        public override void Compute()
        {
            var input = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");

            Workspace.Store("listOfArtifacts", PorterStemmerUtils.ProcessArtifacts(input));
        }
Esempio n. 8
0
        /// <summary>
        /// Imports a corpus from the configured identifiers and documents files and stores both
        /// the artifacts ("ListOfArtifacts") and their count ("NumberOfDocuments") in the workspace.
        /// </summary>
        public override void Compute()
        {
            var identifiersPath = _config.Identifiers.Absolute;
            var documentsPath   = _config.Documents.Absolute;

            TLArtifactsCollection corpus = Importers.Corpus.Import(identifiersPath, documentsPath);

            Workspace.Store("ListOfArtifacts", corpus);
            Workspace.Store("NumberOfDocuments", corpus.Count);
        }
Esempio n. 9
0
        /// <summary>
        /// Stores three artifacts and a four-word stopword list in the workspace, runs the
        /// component, and checks the artifact texts that come back.
        /// </summary>
        public void CleanArtifactsWithStopwords()
        {
            TLArtifactsCollection artifacts = new TLArtifactsCollection();

            artifacts.Add(new TLArtifact("id1", "clean these words"));
            artifacts.Add(new TLArtifact("id2", "this has a stopword"));
            artifacts.Add(new TLArtifact("id3", "expression"));
            Workspace.Store("listOfArtifacts", artifacts);

            TLStopwords stopwords = new TLStopwords();

            stopwords.Add("these");
            stopwords.Add("has");
            stopwords.Add("an");
            stopwords.Add("a");
            Workspace.Store("stopwords", stopwords);

            TestComponent.Compute();

            artifacts = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
            stopwords = (TLStopwords)Workspace.Load("stopwords");

            // Assert.AreEqual takes (expected, actual); the arguments were reversed, which made
            // failure messages report the expected and actual values the wrong way round.
            Assert.AreEqual("clean words", artifacts["id1"].Text);
            Assert.AreEqual("this stopword", artifacts["id2"].Text);
            Assert.AreEqual("expression", artifacts["id3"].Text);
        }
Esempio n. 10
0
        /// <summary>
        /// Loads "ListOfArtifacts" from the workspace, passes it to
        /// SemeruSplitter.ProcessArtifacts with the configured keep-compound-identifier option,
        /// and stores the result back under the same key.
        /// </summary>
        public override void Compute()
        {
            var input = (TLArtifactsCollection)Workspace.Load("ListOfArtifacts");

            Workspace.Store(
                "ListOfArtifacts",
                SemeruSplitter.ProcessArtifacts(input, _config.KeepCompoundIdentifier));
        }
Esempio n. 11
0
        /// <summary>
        /// Uses the text of every source artifact as a search query against the dictionary index
        /// and records each returned result as a link in a new similarity matrix.
        /// </summary>
        /// <param name="sourceArtifacts">Artifacts whose text is used as the queries</param>
        /// <param name="dict">Dictionary index handed to the searcher</param>
        /// <param name="config">Supplies the similarity metric the searcher is built with</param>
        /// <returns>Matrix holding one link per (source artifact, search result) pair</returns>
        /// <exception cref="ComponentException">sourceArtifacts or dict is null</exception>
        private static TLSimilarityMatrix Process(TLArtifactsCollection sourceArtifacts, TLDictionaryIndex dict, TracerConfig config)
        {
            if (sourceArtifacts == null)
            {
                throw new ComponentException("Received null sourceArtifacts");
            }
            if (dict == null)
            {
                throw new ComponentException("Received null dictionaryIndex");
            }

            TLSimilarityMatrix links  = new TLSimilarityMatrix();
            Searcher           engine = new Searcher(SimilarityMetricFactory.GetSimiliarityMetric(config.SimilarityMetric));

            foreach (TLArtifact queryArtifact in sourceArtifacts.Values)
            {
                // One search per source artifact; every hit becomes a source -> target link.
                foreach (Result hit in engine.search(queryArtifact.Text, dict))
                {
                    links.AddLink(queryArtifact.Id, hit.ArtifactId, hit.Ranking);
                }
            }

            return links;
        }
Esempio n. 12
0
        /// <summary>
        /// Builds an artifacts collection by reading two files in lock-step: each line of the
        /// identifiers file is paired with the line at the same position in the documents file.
        /// Identifiers and documents are trimmed. An identifier that is already present gets a
        /// numeric suffix ("_1", "_2", ...) so every artifact has a unique key.
        /// </summary>
        /// <param name="idPath">Path of the file holding one identifier per line</param>
        /// <param name="docPath">Path of the file holding one document per line</param>
        /// <returns>Collection with one artifact per line of the identifiers file</returns>
        /// <exception cref="InvalidDataException">
        /// The documents file has fewer lines than the identifiers file
        /// (previously surfaced as a NullReferenceException).
        /// </exception>
        public static TLArtifactsCollection Import(String idPath, String docPath)
        {
            TLArtifactsCollection artifacts = new TLArtifactsCollection();

            // Both readers are disposed even when reading throws. Previously the documents
            // reader was never closed and neither reader was released on an exception.
            using (StreamReader idFile = new StreamReader(idPath))
            using (StreamReader docFile = new StreamReader(docPath))
            {
                String origid;

                while ((origid = idFile.ReadLine()) != null)
                {
                    // read doc
                    String doc = docFile.ReadLine();
                    if (doc == null)
                    {
                        throw new InvalidDataException(
                            "Documents file '" + docPath + "' has fewer lines than identifiers file '" + idPath + "'.");
                    }
                    doc = doc.Trim();

                    // set vars
                    String baseId = origid.Trim();
                    String id     = baseId;
                    int    num    = 0;

                    // De-duplicate: append _1, _2, ... until the id is unused.
                    while (artifacts.ContainsKey(id))
                    {
                        num++;
                        id = baseId + "_" + num.ToString();
                    }

                    artifacts.Add(new TLArtifact(id, doc));
                }
            }

            return artifacts;
        }
        /// <summary>
        /// Validates the arguments, appends ".csv" to the path when it does not already end with
        /// it (case-insensitively), and writes the artifacts to that file via WriteArtifactsToFile.
        /// </summary>
        /// <param name="artifacts">Artifacts to write; must not be null</param>
        /// <param name="outputPath">Absolute path of the output file; must not be null</param>
        /// <exception cref="ComponentException">
        /// artifacts is null, outputPath is null, or outputPath is not rooted
        /// </exception>
        private static void CreateCSVReport(TLArtifactsCollection artifacts, string outputPath)
        {
            if (artifacts == null)
            {
                throw new ComponentException("Received artifacts collection is null!");
            }
            if (outputPath == null)
            {
                throw new ComponentException("Output path cannot be null.");
            }

            bool isAbsolute = System.IO.Path.IsPathRooted(outputPath);
            if (!isAbsolute)
            {
                throw new ComponentException(String.Format("Absolute output path is required. Given path is '{0}'", outputPath));
            }

            bool hasCsvExtension = outputPath.EndsWith(".csv", StringComparison.CurrentCultureIgnoreCase);
            if (!hasCsvExtension)
            {
                outputPath += ".csv";
            }

            using (System.IO.TextWriter csv = new StreamWriter(outputPath))
            {
                WriteArtifactsToFile(artifacts, csv);
                csv.Flush();
                csv.Close();
            }
        }
        /// <summary>
        /// Pushes blank, whitespace-only, accented and unusual-symbol texts through the component
        /// under test. Blank and whitespace-only texts must come back empty; the accented and
        /// symbol texts must come back unchanged.
        /// </summary>
        public void RarePhrasesTest()
        {
            TLArtifactsCollection input = new TLArtifactsCollection();

            input.Add(new TLArtifact("blank", ""));
            input.Add(new TLArtifact("space", " "));
            input.Add(new TLArtifact("accents", "à ì"));    // à = 133 ì = 141
            input.Add(new TLArtifact("unrecognized", "╣")); // the debugger sees this value as a square ascii is 441
            Workspace.Store("listOfArtifacts", input);
            TestComponent.Compute();

            TLArtifactsCollection output = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");

            // { artifact id, expected text after processing }, checked in this order.
            string[,] expectations =
            {
                { "blank", "" },
                { "space", "" },
                { "accents", "à ì" },
                { "unrecognized", "╣" }
            };

            for (int row = 0; row < expectations.GetLength(0); row++)
            {
                string id       = expectations[row, 0];
                string expected = expectations[row, 1];
                string actual   = output[id].Text;

                if (actual != expected)
                {
                    Assert.Fail(id + " got '" + actual + "' when '" + expected + "' was expected");
                }
            }
        }
Esempio n. 15
0
 /// <summary>
 /// Creates a script from the given artifacts, wrapping each collection in a
 /// TermDocumentMatrix, and keeps the configuration object.
 /// </summary>
 /// <param name="source">Source artifacts</param>
 /// <param name="target">Target artifacts</param>
 /// <param name="config">Configuration object</param>
 public LDAScript(TLArtifactsCollection source, TLArtifactsCollection target, LDAConfig config)
     : base()
 {
     // Matrices are built in source-then-target order.
     this._source = new TermDocumentMatrix(source);
     this._target = new TermDocumentMatrix(target);
     this._config = config;
 }
        /// <summary>
        /// Writes an artifacts collection to an XML file: an "artifacts_collection" root element
        /// containing the collection info followed by the artifacts.
        /// </summary>
        /// <param name="artifactsCollection">Collection to write; must not be null</param>
        /// <param name="outputPath">Path of the XML file to create</param>
        /// <param name="collectionId">Forwarded to WriteCollectionInfo</param>
        /// <param name="name">Forwarded to WriteCollectionInfo</param>
        /// <param name="version">Forwarded to WriteCollectionInfo</param>
        /// <param name="description">Forwarded to WriteCollectionInfo</param>
        /// <exception cref="TraceLabSDK.ComponentException">artifactsCollection is null</exception>
        public static void Export(TLArtifactsCollection artifactsCollection, string outputPath, string collectionId, string name, string version, string description)
        {
            if (artifactsCollection == null)
            {
                throw new TraceLabSDK.ComponentException("Received null artifacts collection.");
            }

            var writerSettings = new System.Xml.XmlWriterSettings
            {
                Indent = true,
                CloseOutput = true,
                CheckCharacters = true
            };

            // Creates the file at outputPath.
            using (var xml = System.Xml.XmlWriter.Create(outputPath, writerSettings))
            {
                xml.WriteStartDocument();
                xml.WriteStartElement("artifacts_collection");

                WriteCollectionInfo(xml, collectionId, name, version, description);
                WriteArtifacts(artifactsCollection, xml);

                xml.WriteEndElement(); // artifacts_collection
                xml.WriteEndDocument();
                xml.Close();
            }

            System.Diagnostics.Trace.WriteLine("File created , you can find the file " + outputPath);
        }
Esempio n. 17
0
        /// <summary>
        /// Runs the component with one source artifact, one target artifact and an empty
        /// dictionary index, and checks that a similarity matrix is produced and has no entries.
        /// </summary>
        public void EmptyDictionaryIndexTest()
        {
            TLArtifactsCollection sources = new TLArtifactsCollection();
            sources.Add(new TLArtifact("id", "text"));

            TLArtifactsCollection targets = new TLArtifactsCollection();
            targets.Add(new TLArtifact("id", "text"));

            TLDictionaryIndex emptyIndex = new TLDictionaryIndex();

            Workspace.Store("sourceArtifacts", sources);
            Workspace.Store("targetArtifacts", targets);
            Workspace.Store("dictionaryIndex", emptyIndex);

            TracerConfig tracerConfig = (TracerConfig)TestComponent.Configuration;
            tracerConfig.SimilarityMetric = SimilarityMetricMethod.SimpleMatching;

            TestComponent.Compute();

            TLSimilarityMatrix result = (TLSimilarityMatrix)Workspace.Load("similarityMatrix");

            bool createdAndEmpty = result != null && result.Count == 0;
            if (!createdAndEmpty)
            {
                Assert.Fail("Similarity Matrix should still be created but have nothing in it");
            }
        }
        /// <summary>
        /// Runs the component on an empty artifacts collection; the test passes as long as
        /// Compute does not throw.
        /// </summary>
        public void EmptyCollectionTest()
        {
            Workspace.Store("listOfArtifacts", new TLArtifactsCollection());

            TestComponent.Compute();
        }
        /// <summary>
        /// Runs the component on three artifacts and checks each resulting text against the
        /// stemmed form expected for it.
        /// </summary>
        public void CleanListOfArtifacts()
        {
            TLArtifactsCollection input = new TLArtifactsCollection();

            input.Add(new TLArtifact("id", "addition"));
            input.Add(new TLArtifact("id2", "works all arounds"));
            input.Add(new TLArtifact("id3", "the world is nothing but a huge sphere"));
            Workspace.Store("listOfArtifacts", input);

            TestComponent.Compute();
            TLArtifactsCollection output = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");

            string first = output["id"].Text;
            if (first != "addit")
            {
                Assert.Fail("id got '" + first + "' when 'addit' was expected");
            }

            string second = output["id2"].Text;
            if (second != "work all around")
            {
                Assert.Fail("id2 got '" + second + "' when 'work all around' was expected");
            }

            string third = output["id3"].Text;
            if (third != "the world is noth but a huge sphere")
            {
                Assert.Fail("id3 got '" + third + "' when 'the world is noth but a huge sphere' was expected");
            }
        }
        /// <summary>
        /// Validates the arguments, appends ".csv" to the path when it does not already end with
        /// it (case-insensitively), and writes the artifacts to that file via WriteArtifactsToFile.
        /// </summary>
        /// <param name="artifacts">Artifacts to write; must not be null</param>
        /// <param name="outputPath">Absolute path of the output file; must not be null</param>
        /// <exception cref="ComponentException">
        /// artifacts is null, outputPath is null, or outputPath is not rooted
        /// </exception>
        private static void CreateCSVReport(TLArtifactsCollection artifacts, string outputPath)
        {
            if (artifacts == null)
            {
                throw new ComponentException("Received artifacts collection is null!");
            }
            if (outputPath == null)
            {
                throw new ComponentException("Output path cannot be null.");
            }

            bool isAbsolute = System.IO.Path.IsPathRooted(outputPath);
            if (!isAbsolute)
            {
                throw new ComponentException(String.Format("Absolute output path is required. Given path is '{0}'", outputPath));
            }

            bool hasCsvExtension = outputPath.EndsWith(".csv", StringComparison.CurrentCultureIgnoreCase);
            if (!hasCsvExtension)
            {
                outputPath += ".csv";
            }

            using (System.IO.TextWriter csv = new StreamWriter(outputPath))
            {
                WriteArtifactsToFile(artifacts, csv);
                csv.Flush();
                csv.Close();
            }
        }
        /// <summary>
        /// Loads the source and target artifacts from the workspace, validates the configured
        /// answer set file path, imports the answer set and stores it as "answerMatrix".
        /// </summary>
        /// <exception cref="ArgumentException">
        /// Either artifacts collection is null, or path validation fails (the validation error
        /// text becomes the exception message).
        /// </exception>
        public override void Compute()
        {
            var sources = (TLArtifactsCollection)Workspace.Load("sourceArtifacts");
            var targets = (TLArtifactsCollection)Workspace.Load("targetArtifacts");

            if (sources == null)
            {
                throw new ArgumentException("Source artifacts cannot be null.");
            }
            if (targets == null)
            {
                throw new ArgumentException("Target artifacts cannot be null.");
            }

            // Validate first and bail out early; the import only runs for a valid path.
            string validationError;
            bool pathIsValid = CoestDatasetImporterHelper.ValidatePath(m_config.FilePath, "Answer Set File", out validationError);
            if (!pathIsValid)
            {
                throw new ArgumentException(validationError);
            }

            var answerMatrix = CoestDatasetImporterHelper.ImportAnswerSet(m_config.FilePath, sources, targets, Logger, m_config.TrimElementValues);
            Workspace.Store("answerMatrix", answerMatrix);
            Logger.Trace(String.Format("Answer matrix imported from {0}.", m_config.FilePath));
        }
Esempio n. 22
0
        /// <summary>
        /// Loads "listOfArtifacts" from the workspace, passes it to Cleanup.ProcessArtifacts
        /// with the configured lowercase option, and stores the result back under the same key.
        /// </summary>
        public override void Compute()
        {
            var input = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");

            Workspace.Store(
                "listOfArtifacts",
                Cleanup.ProcessArtifacts(input, _config.ConvertLowercase));
        }
Esempio n. 23
0
        /// <summary>
        /// Builds an artifacts collection by reading two files in lock-step: each line of the
        /// identifiers file is paired with the line at the same position in the documents file.
        /// Identifiers and documents are trimmed. An identifier that is already present gets a
        /// numeric suffix ("_1", "_2", ...) so every artifact has a unique key.
        /// </summary>
        /// <param name="idPath">Path of the file holding one identifier per line</param>
        /// <param name="docPath">Path of the file holding one document per line</param>
        /// <returns>Collection with one artifact per line of the identifiers file</returns>
        /// <exception cref="InvalidDataException">
        /// The documents file has fewer lines than the identifiers file
        /// (previously surfaced as a NullReferenceException).
        /// </exception>
        public static TLArtifactsCollection Import(String idPath, String docPath)
        {
            TLArtifactsCollection artifacts = new TLArtifactsCollection();

            // Both readers are disposed even when reading throws. Previously the documents
            // reader was never closed and neither reader was released on an exception.
            using (StreamReader idFile = new StreamReader(idPath))
            using (StreamReader docFile = new StreamReader(docPath))
            {
                String origid;

                while ((origid = idFile.ReadLine()) != null)
                {
                    // read doc
                    String doc = docFile.ReadLine();
                    if (doc == null)
                    {
                        throw new InvalidDataException(
                            "Documents file '" + docPath + "' has fewer lines than identifiers file '" + idPath + "'.");
                    }
                    doc = doc.Trim();

                    // set vars
                    String baseId = origid.Trim();
                    String id = baseId;
                    int num = 0;

                    // De-duplicate: append _1, _2, ... until the id is unused.
                    while (artifacts.ContainsKey(id))
                    {
                        num++;
                        id = baseId + "_" + num.ToString();
                    }

                    artifacts.Add(new TLArtifact(id, doc));
                }
            }

            return artifacts;
        }
Esempio n. 24
0
        /// <summary>
        /// Runs the component on three artifacts and checks that every text comes back unchanged.
        /// </summary>
        public void CleanListOfArtifacts()
        {
            // { artifact id, text } - the same text is expected after processing.
            string[,] cases =
            {
                { "id", "this is text" },
                { "id2", "this is text" },
                { "id3", "this is more text" }
            };

            TLArtifactsCollection input = new TLArtifactsCollection();
            for (int row = 0; row < cases.GetLength(0); row++)
            {
                input.Add(new TLArtifact(cases[row, 0], cases[row, 1]));
            }
            Workspace.Store("listOfArtifacts", input);

            TestComponent.Compute();
            TLArtifactsCollection output = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");

            for (int row = 0; row < cases.GetLength(0); row++)
            {
                string id       = cases[row, 0];
                string expected = cases[row, 1];
                string actual   = output[id].Text;

                if (actual != expected)
                {
                    Assert.Fail(id + " got '" + actual + "' when '" + expected + "' was expected");
                }
            }
        }
        /// <summary>
        /// Exports the "artifactsCollection" workspace item to the XML file named by the
        /// "outputPath" workspace item, appending ".xml" when the path does not already end with
        /// it. The collection name and id are both the file name without directory and without
        /// the ".xml" extension; the id is stored in the workspace as "collectionId".
        /// </summary>
        public override void Compute()
        {
            TLArtifactsCollection artifactsCollection = (TLArtifactsCollection)Workspace.Load("artifactsCollection");

            const string extension = ".xml";

            string path = (string)Workspace.Load("outputPath");
            string collectionName;

            // Ordinal comparison guarantees the matched suffix is exactly extension.Length
            // characters long, so the Substring below removes precisely the extension.
            if (path.EndsWith(extension, StringComparison.OrdinalIgnoreCase))
            {
                // Previously Remove(LastIndexOf(".") + 1) kept the dot, so "foo.xml" produced
                // the collection name/id "foo." instead of "foo".
                collectionName = path.Substring(0, path.Length - extension.Length);
            }
            else
            {
                collectionName = path;
                path           = path + extension;
            }

            // Keep only the part after the last backslash (everything when there is none).
            collectionName = collectionName.Substring(collectionName.LastIndexOf('\\') + 1);
            string id      = collectionName;

            ArtifactsCollectionExporterUtilities.Export(artifactsCollection, path, id, collectionName, "1.1", collectionName);

            Workspace.Store("collectionId", id);

            Logger.Info(String.Format("Artifacts Collection has been saved into xml file '{0}'", path));
        }
Esempio n. 26
0
        /// <summary>
        /// Loads "listOfArtifacts" from the workspace, hands it to ProcessArtifacts together with
        /// the configuration, and stores the same collection instance back under the same key.
        /// </summary>
        public override void Compute()
        {
            var artifacts = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");

            // The return value (if any) is not used; the loaded instance itself is stored.
            ProcessArtifacts(artifacts, _config);
            Workspace.Store("listOfArtifacts", artifacts);
        }
Esempio n. 27
0
        /// <summary>
        /// Imports a corpus from the configured identifiers and documents files, builds a
        /// document map from the identifiers file, and stores them in the workspace as
        /// "ListOfArtifacts" and "DocumentMap".
        /// </summary>
        public override void Compute()
        {
            TLArtifactsCollection corpus = Importers.Corpus.Import(_config.Identifiers.Absolute, _config.Documents.Absolute);
            List <string> documentMap = Importers.Corpus.Map(_config.Identifiers.Absolute);

            Workspace.Store("ListOfArtifacts", corpus);
            Workspace.Store("DocumentMap", documentMap);
        }
Esempio n. 28
0
        /// <summary>
        /// Loads "SourceArtifacts" and "TargetArtifacts" from the workspace, passes them to
        /// VSM.Compute with the configured weighting scheme, and stores the resulting
        /// similarity matrix as "Similarities".
        /// </summary>
        public override void Compute()
        {
            var source = (TLArtifactsCollection)Workspace.Load("SourceArtifacts");
            var target = (TLArtifactsCollection)Workspace.Load("TargetArtifacts");

            TLSimilarityMatrix similarities = VSM.Compute(source, target, _config.WeightingScheme);
            Workspace.Store("Similarities", similarities);
        }
        /// <summary>
        /// Loads "artifactsCollection" from the workspace and exports it through
        /// ArtifactsCollectionExporterUtilities.Export using the configured path, id, name,
        /// version and description, then logs where the file was saved.
        /// </summary>
        public override void Compute()
        {
            var artifacts = (TLArtifactsCollection)Workspace.Load("artifactsCollection");

            ArtifactsCollectionExporterUtilities.Export(artifacts, Config.Path, Config.CollectionId, Config.CollectionName, Config.CollectionVersion, Config.CollectionDescription);

            string message = String.Format("Artifacts Collection has been saved into xml file '{0}'", Config.Path.Absolute);
            Logger.Info(message);
        }
Esempio n. 30
0
        /// <summary>
        /// Loads "SimilarityMatrix", "GoldSet" and "Queries" from the workspace and hands them
        /// to Effectiveness.Export together with the two configured output files.
        /// </summary>
        public override void Compute()
        {
            var similarities = (TLSimilarityMatrix)Workspace.Load("SimilarityMatrix");
            var goldSet      = (TLSimilarityMatrix)Workspace.Load("GoldSet");
            var queries      = (TLArtifactsCollection)Workspace.Load("Queries");

            Effectiveness.Export(queries, similarities, goldSet, _config.AllMethodsFile, _config.BestMethodsFile);
        }
        /// <summary>
        /// Loads "artifactsCollection" from the workspace, writes it as a CSV report to the
        /// configured absolute path, and logs where the file was saved.
        /// </summary>
        public override void Compute()
        {
            var artifacts = (TLArtifactsCollection)Workspace.Load("artifactsCollection");

            CreateCSVReport(artifacts, Config.Path.Absolute);

            string message = String.Format("Artifacts Collection has been saved into csv file '{0}'", Config.Path.Absolute);
            Logger.Info(message);
        }
        /// <summary>
        /// Runs the component on an empty artifacts collection; the test passes as long as
        /// Compute does not throw.
        /// </summary>
        public void EmptyCollectionTest()
        {
            Workspace.Store("listOfArtifacts", new TLArtifactsCollection());

            TestComponent.Compute();
        }
Esempio n. 33
0
        /// <summary>
        /// Loads "SourceArtifacts" and "TargetArtifacts" from the workspace, passes them to
        /// JSD.Compute, and stores the resulting similarity matrix as "Similarities".
        /// </summary>
        public override void Compute()
        {
            var sourceArtifacts = (TLArtifactsCollection)Workspace.Load("SourceArtifacts");
            var targetArtifacts = (TLArtifactsCollection)Workspace.Load("TargetArtifacts");

            TLSimilarityMatrix similarities = JSD.Compute(sourceArtifacts, targetArtifacts);
            Workspace.Store("Similarities", similarities);
        }
Esempio n. 34
0
        /// <summary>
        /// Loads "ListOfArtifacts" from the workspace, builds a Models.Vectorizer over it with
        /// the configured representation, and stores the vectorizer's Vectors and Frequencies
        /// as "DocumentVectors" and "DocumentFrequencies".
        /// </summary>
        public override void Compute()
        {
            var corpus = (TLArtifactsCollection)Workspace.Load("ListOfArtifacts");
            Models.Vectorizer vectorizer = new Models.Vectorizer(corpus, _config.Representation);

            Workspace.Store("DocumentVectors", vectorizer.Vectors);
            Workspace.Store("DocumentFrequencies", vectorizer.Frequencies);
        }
        /// <summary>
        /// Loads "listOfArtifacts" from the workspace, builds a dictionary index from it via
        /// BuildDictionary, and stores the index as "dictionaryIndex".
        /// </summary>
        public override void Compute()
        {
            var artifacts = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
            TLDictionaryIndex index = BuildDictionary(artifacts, Logger);

            Workspace.Store("dictionaryIndex", index);
        }
Esempio n. 36
0
 /// <summary>
 /// Builds a new collection by running ExtractArtifact on every artifact of the input,
 /// keeping each result under the key of the artifact it was produced from.
 /// </summary>
 /// <param name="artifacts">List of artifacts</param>
 /// <param name="pos">Part of speech to extract</param>
 /// <param name="modelFile">Training model file location</param>
 /// <returns>New TLArtifactsCollection holding one extracted artifact per input artifact</returns>
 public static TLArtifactsCollection Extract(TLArtifactsCollection artifacts, POSTaggerSpeechType pos, string modelFile)
 {
     TLArtifactsCollection result = new TLArtifactsCollection();

     foreach (KeyValuePair<string, TLArtifact> entry in artifacts)
     {
         var filtered = ExtractArtifact(entry.Value, pos, modelFile);
         result.Add(entry.Key, filtered);
     }

     return result;
 }
Esempio n. 37
0
 /// <summary>
 /// Creates a named corpus: builds a term-document matrix for each side, remembers the
 /// document map of each, and keeps the combination of both matrices.
 /// </summary>
 /// <param name="name">Corpus name</param>
 /// <param name="source">Source artifacts</param>
 /// <param name="target">Target artifacts</param>
 public LDACorpus(string name, TLArtifactsCollection source, TLArtifactsCollection target)
 {
     Name = name;

     TermDocumentMatrix sourceMatrix = new TermDocumentMatrix(source);
     TermDocumentMatrix targetMatrix = new TermDocumentMatrix(target);

     // Document maps are read before the matrices are combined.
     _sourceDocs = sourceMatrix.DocMap;
     _targetDocs = targetMatrix.DocMap;

     _matrix = TermDocumentMatrix.Combine(sourceMatrix, targetMatrix);
 }
Esempio n. 38
0
        /// <summary>
        /// Loads "SourceArtifacts" and "TargetArtifacts" from the workspace, executes a
        /// GibbsLDA_GAScript on an REngine created from the configured R script path, and
        /// stores the returned configuration as "GibbsLDAConfig".
        /// </summary>
        public override void Compute()
        {
            var sourceArtifacts = (TLArtifactsCollection)Workspace.Load("SourceArtifacts");
            var targetArtifacts = (TLArtifactsCollection)Workspace.Load("TargetArtifacts");

            REngine rEngine = new REngine(_config.RScriptPath);
            var script = new GibbsLDA_GAScript(sourceArtifacts, targetArtifacts, _config);

            GibbsLDAConfig result = (GibbsLDAConfig)rEngine.Execute(script);
            Workspace.Store("GibbsLDAConfig", result);
        }
Esempio n. 39
0
        /// <summary>
        /// Loads "SourceArtifacts" and "TargetArtifacts" from the workspace, executes an
        /// RTMScript on an REngine created from the configured R script path, and stores the
        /// returned similarity matrix as "Similarities".
        /// </summary>
        public override void Compute()
        {
            var sourceArtifacts = (TLArtifactsCollection)Workspace.Load("SourceArtifacts");
            var targetArtifacts = (TLArtifactsCollection)Workspace.Load("TargetArtifacts");

            REngine rEngine = new REngine(_config.RScriptPath);
            var script = new RTMScript(sourceArtifacts, targetArtifacts, _config);

            TLSimilarityMatrix similarities = (TLSimilarityMatrix)rEngine.Execute(script);
            Workspace.Store("Similarities", similarities);
        }
Esempio n. 40
0
 /// <summary>
 /// Exports a corpus in the form (each line):
 /// ID TEXT TEXT TEXT TEXT TEXT ...
 /// Line feeds inside an artifact's text are replaced by spaces and carriage returns are
 /// removed, so every artifact occupies exactly one line.
 /// </summary>
 /// <param name="artifacts">Artifacts collection</param>
 /// <param name="outputfile">Output file path</param>
 public static void ExportFile(TLArtifactsCollection artifacts, string outputfile)
 {
     // using releases the file handle even when a write throws part-way through;
     // previously the writer was only closed on the success path.
     using (TextWriter tw = new StreamWriter(outputfile))
     {
         foreach (TLArtifact artifact in artifacts.Values)
         {
             tw.WriteLine(artifact.Id + " " + artifact.Text.Replace("\n", " ").Replace("\r", String.Empty));
         }
         tw.Flush();
     }
 }
Esempio n. 41
0
 /// <summary>
 /// Builds a three-artifact collection whose ids are "0", "1", "2" and whose texts are
 /// "artifact text 1" through "artifact text 3".
 /// </summary>
 /// <returns>The populated TLArtifactsCollection, typed as object</returns>
 private static object PrepareMockData()
 {
     string[] texts = { "artifact text 1", "artifact text 2", "artifact text 3" };

     TLArtifactsCollection mock = new TLArtifactsCollection();
     for (int index = 0; index < texts.Length; index++)
     {
         // The id is the zero-based position of the text.
         mock.Add(new TLArtifact(index.ToString(), texts[index]));
     }

     return mock;
 }
        /// <summary>
        /// Builds a dictionary index from the artifacts by delegating to TFIDFIndexBuilder.build.
        /// </summary>
        /// <param name="listOfArtifacts">Artifacts to index; must not be null</param>
        /// <param name="logger">Logger handed through to the index builder</param>
        /// <returns>The index produced by the builder</returns>
        /// <exception cref="ComponentException">listOfArtifacts is null</exception>
        private static TLDictionaryIndex BuildDictionary(TLArtifactsCollection listOfArtifacts, ComponentLogger logger)
        {
            if (listOfArtifacts == null)
            {
                throw new ComponentException("Received null listOfArtifacts");
            }

            return TFIDFIndexBuilder.build(listOfArtifacts, logger);
        }
        /// <summary>
        /// Writes the artifacts as CSV: an "Id,Text" header line followed by one line per
        /// artifact with both fields enclosed in double quotes.
        /// </summary>
        /// <param name="artifacts">Artifacts to write</param>
        /// <param name="writeFile">Writer receiving the CSV text; it is not closed here</param>
        private static void WriteArtifactsToFile(TLArtifactsCollection artifacts, System.IO.TextWriter writeFile)
        {
            //header
            writeFile.WriteLine("Id,Text");

            foreach (TLArtifact artifact in artifacts.Values)
            {
                // A double quote inside a quoted CSV field must be doubled; without this any
                // text containing '"' ended the field early and corrupted the row.
                // Null values are written as empty fields, as before.
                string id   = (artifact.Id ?? string.Empty).Replace("\"", "\"\"");
                string text = (artifact.Text ?? string.Empty).Replace("\"", "\"\"");

                writeFile.WriteLine("\"{0}\",\"{1}\"", id, text);
            }
        }
Esempio n. 44
0
 /// <summary>
 /// Builds a new artifacts collection in which every artifact keeps its id and has its
 /// text replaced by the result of ProcessText for the given stemmer language.
 /// </summary>
 /// <param name="artifacts">Artifacts collection</param>
 /// <param name="langauge">Stemmer language</param>
 /// <returns>Stemmed artifacts</returns>
 public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection artifacts, SnowballStemmerEnum langauge)
 {
     TLArtifactsCollection result = new TLArtifactsCollection();

     foreach (TLArtifact original in artifacts.Values)
     {
         // Created with empty text first; the processed text is assigned afterwards.
         TLArtifact stemmed = new TLArtifact(original.Id, String.Empty)
         {
             Text = ProcessText(original.Text, langauge)
         };
         result.Add(stemmed);
     }

     return result;
 }
Esempio n. 45
0
 /// <summary>
 /// Builds a new artifacts collection in which every artifact keeps its id and has its
 /// text replaced by the result of ProcessText for the given stopwords and options.
 /// </summary>
 /// <param name="listOfArtifacts">Artifacts collection</param>
 /// <param name="stopwords">Stopwords collection</param>
 /// <param name="minWordLength">Minimum word length</param>
 /// <param name="removeNumbers">Flag to remove numbers</param>
 /// <returns>Processed artifacts</returns>
 public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, TLStopwords stopwords, int minWordLength, bool removeNumbers)
 {
     TLArtifactsCollection result = new TLArtifactsCollection();

     foreach (TLArtifact original in listOfArtifacts.Values)
     {
         // Created with empty text first; the processed text is assigned afterwards.
         TLArtifact filtered = new TLArtifact(original.Id, String.Empty)
         {
             Text = ProcessText(original.Text, stopwords, minWordLength, removeNumbers)
         };
         result.Add(filtered);
     }

     return result;
 }
Esempio n. 46
0
 /// <summary>
 /// Builds a new artifacts collection in which every artifact keeps its id and has its
 /// text replaced by the result of ProcessText with the given compound-identifier option.
 /// </summary>
 /// <param name="listOfArtifacts">Artifacts collection</param>
 /// <param name="keepCompoundIdentifier">Passed through to ProcessText for every artifact</param>
 /// <returns>Split artifacts</returns>
 public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, bool keepCompoundIdentifier)
 {
     TLArtifactsCollection result = new TLArtifactsCollection();

     foreach (TLArtifact original in listOfArtifacts.Values)
     {
         // Created with empty text first; the processed text is assigned afterwards.
         TLArtifact split = new TLArtifact(original.Id, String.Empty)
         {
             Text = ProcessText(original.Text, keepCompoundIdentifier)
         };
         result.Add(split);
     }

     return result;
 }
Esempio n. 47
0
 /// <summary>
 /// Processes an artifacts collection, splitting CamelCase terms, and returns
 /// the results as a new collection.
 /// </summary>
 /// <param name="listOfArtifacts">Artifacts collection</param>
 /// <param name="convertToLowercase">Option to convert resulting terms to lowercase, forwarded to ProcessText</param>
 /// <returns>A new collection with one processed artifact per input artifact, keeping the same IDs</returns>
 public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, bool convertToLowercase)
 {
     TLArtifactsCollection camelSplitCollection = new TLArtifactsCollection();
     foreach (TLArtifact original in listOfArtifacts.Values)
     {
         // Empty-text artifact with the original ID, populated with the processed text.
         TLArtifact camelSplit = new TLArtifact(original.Id, String.Empty)
         {
             Text = ProcessText(original.Text, convertToLowercase)
         };
         camelSplitCollection.Add(camelSplit);
     }
     return camelSplitCollection;
 }
Esempio n. 48
0
        /// <summary>
        /// Builds one term vector per artifact and, alongside, a document-frequency
        /// vector that counts in how many artifacts each term occurs.
        /// </summary>
        /// <param name="artifacts">Artifacts to vectorize; each artifact's text is split on single spaces</param>
        /// <param name="representation">When equal to "Ordinal", repeated terms increase the per-document count; for any other value every term that occurs stays at 1</param>
        public Vectorizer(TLArtifactsCollection artifacts, String representation)
        {
            vectors = new DocumentVectorCollection();
            freq = new DocumentVector("DocumentFrequencies");

            // the representation never changes while vectorizing, so compare it only once
            bool countRepeats = representation == "Ordinal";

            foreach (KeyValuePair<string, TLArtifact> kvp in artifacts)
            {
                // vars
                String docID = kvp.Value.Id;
                // NOTE(review): Split(' ') yields empty strings for consecutive spaces and for
                // empty text, and those are counted as a term like any other — confirm intended.
                String[] words = kvp.Value.Text.Split(' ');

                // create new document representation
                DocumentVector vec = new DocumentVector(docID);
                // terms of this document already counted towards the document frequency;
                // a hash set keeps the membership test constant-time instead of a list scan
                HashSet<String> countedWords = new HashSet<String>();

                // loop over each word and update its frequency
                foreach (String word in words)
                {
                    // update term-doc frequency only ONCE per document
                    // (Add returns false when the word was already seen in this document)
                    if (countedWords.Add(word))
                    {
                        if (freq.ContainsKey(word))
                        {
                            freq[word]++;
                        }
                        else
                        {
                            freq.Add(word, 1);
                        }
                    }

                    // update word frequency
                    if (!vec.ContainsKey(word))
                    {
                        vec.Add(word, 1);
                    }
                    else if (countRepeats)
                    {
                        vec[word]++;
                    }

                    // update MaxFreq
                    if (vec[word] > vec.MaxFreq.Value)
                    {
                        vec.MaxFreq = new KeyValuePair<string, int>(word, vec[word]);
                    }
                }

                // add document to vector collection
                vectors.Add(vec);
            }
        }
        /// <summary>
        /// Replaces the text of every artifact in the collection, in place, with the
        /// output of PreprocessorCleanUp.Process.
        /// </summary>
        /// <param name="listOfArtifacts">Artifacts whose text is rewritten; must not be null</param>
        /// <param name="config">Component configuration; its RemoveDigits value is passed to the clean-up step</param>
        /// <exception cref="ComponentException">Thrown when <paramref name="listOfArtifacts"/> is null</exception>
        private static void ProcessArtifacts(TLArtifactsCollection listOfArtifacts, PreprocessorCleanUpComponentConfig config)
        {
            if (listOfArtifacts == null)
            {
                throw new ComponentException("Received null listofArtifacts");
            }

            foreach (TLArtifact current in listOfArtifacts.Values)
            {
                // config is read per artifact, so it is never touched for an empty collection
                string cleanedText = PreprocessorCleanUp.Process(current.Text, config.RemoveDigits);
                current.Text = cleanedText;
            }
        }
        /// <summary>
        /// Replaces the text of every artifact in the collection, in place, with the
        /// output of PreprocessorStemmer.Process.
        /// </summary>
        /// <param name="listOfArtifacts">Artifacts whose text is rewritten; must not be null</param>
        /// <exception cref="ComponentException">Thrown when <paramref name="listOfArtifacts"/> is null</exception>
        private static void ProcessArtifacts(TLArtifactsCollection listOfArtifacts)
        {
            if (listOfArtifacts == null)
            {
                throw new ComponentException("Recieved Null listofArtifacts");
            }

            foreach (TLArtifact current in listOfArtifacts.Values)
            {
                string stemmedText = PreprocessorStemmer.Process(current.Text);
                current.Text = stemmedText;
            }
        }
        /// <summary>
        /// Computes the traceability between source and target artifacts using dictionary and American Corpus Term weights.
        /// Each source artifact's text is run as a query through an ANCSearcher, and every
        /// returned result becomes a link in the similarity matrix.
        /// </summary>
        /// <param name="sourceArtifacts">The source artifacts; each one's text is used as a query.</param>
        /// <param name="targetArtifacts">The target artifacts; only checked for null here.</param>
        /// <param name="dict">The dictionary index that is searched.</param>
        /// <param name="ancTermsWeights">The ANC term weights, converted with PrepareANCData for each search.</param>
        /// <param name="config">The config; its SimilarityMetric selects the metric handed to the searcher.</param>
        /// <returns>Similarity matrix with links between source and target artifacts</returns>
        /// <exception cref="ComponentException">Thrown when any of the arguments is null.</exception>
        private static TLSimilarityMatrix ComputeTraceability(TLArtifactsCollection sourceArtifacts, 
                                                              TLArtifactsCollection targetArtifacts, 
                                                              TLDictionaryIndex dict, 
                                                              TLKeyValuePairsList ancTermsWeights, 
                                                              TracerConfig config)
        {
            if (sourceArtifacts == null)
            {
                throw new ComponentException("Received source artifacts are null!");
            }

            if (targetArtifacts == null)
            {
                throw new ComponentException("Received target artifacts are null!");
            }

            if (dict == null)
            {
                throw new ComponentException("Received dictionary index is null!");
            }

            if (ancTermsWeights == null)
            {
                throw new ComponentException("Received 'ancTermsWeights' is null!");
            }

            // config is dereferenced right below, so guard it the same way as the other inputs
            // instead of letting a NullReferenceException escape
            if (config == null)
            {
                throw new ComponentException("Received config is null!");
            }

            TLSimilarityMatrix similarityMatrix = new TLSimilarityMatrix();

            ANCSearcher searcher = new ANCSearcher(SimilarityMetricFactory.GetSimiliarityMetric(config.SimilarityMetric));

            // Iterates over all the source artifacts to determine the probabilities to target artifacts - by executing a search
            foreach (TLArtifact sourceArtifact in sourceArtifacts.Values)
            {
                String query = sourceArtifact.Text;

                // Executes the query
                // NOTE(review): PrepareANCData is re-run for every source artifact; if it is pure and
                // the searcher does not mutate its result, it could be computed once before the loop.
                List<Result> results = searcher.search(query, dict, PrepareANCData(ancTermsWeights));

                // Iterates over the results and stores them in the matrix
                foreach (Result r in results)
                {
                    string targetArtifactId = r.ArtifactId;
                    similarityMatrix.AddLink(sourceArtifact.Id, targetArtifactId, r.Ranking);
                }
            }
            return similarityMatrix;
        }
Esempio n. 52
0
 /// <summary>
 /// Computes cosine similarities between documents via the Vector Space Model,
 /// using the term weighting selected by <paramref name="weight"/>.
 /// </summary>
 /// <param name="source">Source artifacts</param>
 /// <param name="target">Target artifacts</param>
 /// <param name="weight">Weighting scheme</param>
 /// <returns>Similarity matrix</returns>
 /// <exception cref="NotImplementedException">Thrown for a weighting scheme not handled here</exception>
 public static TLSimilarityMatrix Compute(TLArtifactsCollection source, TLArtifactsCollection target, VSMWeightEnum weight)
 {
     if (weight == VSMWeightEnum.TFIDF)
     {
         // one matrix over both collections, TF-IDF weighted as a whole
         TermDocumentMatrix combined = new TermDocumentMatrix(source, target);
         return SimilarityUtil.ComputeCosine(WeightUtil.ComputeTFIDF(combined), source.Keys, target.Keys);
     }

     if (weight == VSMWeightEnum.BooleanQueriesAndTFIDFCorpus)
     {
         // separate matrices: binary TF for the source side, TF-IDF for the target side
         TermDocumentMatrix queries = new TermDocumentMatrix(source);
         return SimilarityUtil.ComputeCosine(WeightUtil.ComputeBinaryTF(queries), WeightUtil.ComputeTFIDF(new TermDocumentMatrix(target)));
     }

     if (weight == VSMWeightEnum.NoWeight)
     {
         // raw matrix over both collections, no weighting applied
         TermDocumentMatrix combined = new TermDocumentMatrix(source, target);
         return SimilarityUtil.ComputeCosine(combined, source.Keys, target.Keys);
     }

     throw new NotImplementedException("Unknown weighting scheme \"" + weight + "\"");
 }
        // Runs the component with an empty artifacts collection and a non-empty stopword list.
        // There are no assertions here; the method only exercises Compute() with this input.
        public void EmptyArtifactListTest()
        {
            TLArtifactsCollection emptyCollection = new TLArtifactsCollection();
            Workspace.Store("listOfArtifacts", emptyCollection);

            TLStopwords stopwordList = new TLStopwords();
            foreach (string entry in new string[] { "one", "word", "to", "add" })
            {
                stopwordList.Add(entry);
            }
            Workspace.Store("stopwords", stopwordList);

            TestComponent.Compute();
        }
        // Runs the component with four artifacts and an empty stopword list.
        // There are no assertions here; the method only exercises Compute() with this input.
        public void EmptyStopWordsListTest()
        {
            string[] ids = { "id", "id1", "id2", "id3" };
            string[] texts = { "text 1", "text two", "text is three", "text has a the stop word" };

            TLArtifactsCollection corpus = new TLArtifactsCollection();
            for (int i = 0; i < ids.Length; i++)
            {
                corpus.Add(new TLArtifact(ids[i], texts[i]));
            }
            Workspace.Store("listOfArtifacts", corpus);

            TLStopwords noStopwords = new TLStopwords();
            Workspace.Store("stopwords", noStopwords);

            TestComponent.Compute();
        }
Esempio n. 55
0
        /// <summary>
        /// Constructs a term-by-document matrix of raw term counts from a TLArtifactsCollection.
        /// Rows follow the order in which artifacts are enumerated; columns follow the order
        /// in which terms are first encountered.
        /// </summary>
        /// <param name="artifacts">Artifacts collection</param>
        public TermDocumentMatrix(TLArtifactsCollection artifacts)
        {
            _termIndex = new List<string>();
            _docIndex = new List<string>();
            _termIndexLookup = new Dictionary<string, int>();
            _docIndexLookup = new Dictionary<string, int>();

            // per-document term counts keyed by artifact ID; gathered first because the
            // number of columns is only known once every artifact has been read
            Dictionary<string, Dictionary<string, double>> countsByDoc = new Dictionary<string, Dictionary<string, double>>();
            foreach (TLArtifact doc in artifacts.Values)
            {
                // register the document and remember its row position
                string docId = doc.Id;
                _docIndex.Add(docId);
                _docIndexLookup.Add(docId, _docIndex.Count - 1);
                Dictionary<string, double> termCounts = new Dictionary<string, double>();
                countsByDoc.Add(docId, termCounts);

                foreach (string token in doc.Text.Split())
                {
                    // splitting on whitespace can yield empty entries; those are not terms
                    if (String.IsNullOrWhiteSpace(token))
                    {
                        continue;
                    }

                    // first sighting of a term assigns it the next free column
                    if (!_termIndexLookup.ContainsKey(token))
                    {
                        _termIndex.Add(token);
                        _termIndexLookup.Add(token, _termIndex.Count - 1);
                    }

                    // bump this document's count (TryGetValue leaves 0 for an unseen term)
                    double previous;
                    termCounts.TryGetValue(token, out previous);
                    termCounts[token] = previous + 1;
                }
            }

            // build term-by-document matrix
            int docTotal = _docIndex.Count;
            int termTotal = _termIndex.Count;
            _matrix = new double[docTotal][];
            for (int row = 0; row < docTotal; row++)
            {
                double[] cells = new double[termTotal];
                _matrix[row] = cells;
                Dictionary<string, double> rowCounts = countsByDoc[_docIndex[row]];
                for (int col = 0; col < termTotal; col++)
                {
                    // terms absent from this document stay at 0
                    rowCounts.TryGetValue(_termIndex[col], out cells[col]);
                }
            }
        }
Esempio n. 56
0
 /// <summary>
 /// Imports a SEMERU corpus in the form (each line):
 /// ID TEXT TEXT TEXT TEXT TEXT ...
 /// The first whitespace-separated token of a line becomes the artifact ID; the
 /// remaining tokens are re-joined with single spaces to form the artifact text.
 /// </summary>
 /// <param name="filename">Corpus file location</param>
 /// <returns>Artifacts collection</returns>
 public static TLArtifactsCollection Import(string filename)
 {
     // the using block closes the file even when a line fails to parse or Add throws
     using (StreamReader file = new StreamReader(filename))
     {
         TLArtifactsCollection answer = new TLArtifactsCollection();
         String line;
         while ((line = file.ReadLine()) != null)
         {
             List<string> artifacts = new List<string>(line.Split());
             String id = artifacts[0].Trim();
             // drop the ID token so only the document text remains
             artifacts.RemoveAt(0);
             String doc = String.Join(" ", artifacts);
             answer.Add(new TLArtifact(id, doc));
         }
         return answer;
     }
 }
Esempio n. 57
0
 // Loads "ListOfArtifacts" from the workspace, runs POSTagger.ExtractArtifact on each
 // artifact with the configured POS and model file, and stores the results back under
 // the same workspace key. Progress is traced after every artifact.
 public override void Compute()
 {
     Logger.Trace("Starting POSExtractor. This may take awhile (especially the bidirectional models)....");
     TLArtifactsCollection input = (TLArtifactsCollection)Workspace.Load("ListOfArtifacts");

     // Artifacts are extracted one at a time rather than through the batch call
     // POSTagger.Extract(artifacts, _config.POS, _config.ModelFile) that was used here before.
     TLArtifactsCollection output = new TLArtifactsCollection();
     int finished = 0;
     foreach (KeyValuePair<string, TLArtifact> entry in input)
     {
         output.Add(entry.Key, POSTagger.ExtractArtifact(entry.Value, _config.POS, _config.ModelFile));
         finished++;
         Logger.Trace("Extracted " + finished + "/" + input.Count);
     }

     Workspace.Store("ListOfArtifacts", output);
 }
        /// <summary>
        /// Replaces the text of every artifact in the collection, in place, with the
        /// output of PreprocessorStopWords.Process for the given stopwords.
        /// </summary>
        /// <param name="listOfArtifacts">Artifacts whose text is rewritten; must not be null</param>
        /// <param name="stopwords">Stopwords passed to the stopword step; must not be null</param>
        /// <exception cref="ComponentException">Thrown when either argument is null</exception>
        private static void ProcessArtifacts(TLArtifactsCollection listOfArtifacts, TLStopwords stopwords)
        {
            // the artifacts check comes first, so it wins when both arguments are null
            if (listOfArtifacts == null)
            {
                throw new ComponentException("Recieved null listofArtifacts");
            }

            if (stopwords == null)
            {
                throw new ComponentException("Recieved null stopwords");
            }

            foreach (TLArtifact current in listOfArtifacts.Values)
            {
                string filteredText = PreprocessorStopWords.Process(current.Text, stopwords);
                current.Text = filteredText;
            }
        }
        /// <summary>
        /// Writes the collection as an "artifacts" element containing one "artifact"
        /// element per entry, each with "id", "content" and an empty "parent_id" child.
        /// </summary>
        /// <param name="artifactsCollection">Artifacts to serialize</param>
        /// <param name="writer">XML writer that receives the elements</param>
        private static void WriteArtifacts(TLArtifactsCollection artifactsCollection, System.Xml.XmlWriter writer)
        {
            writer.WriteStartElement("artifacts");

            foreach (KeyValuePair<string, TLArtifact> entry in artifactsCollection)
            {
                TLArtifact current = entry.Value;

                writer.WriteStartElement("artifact");

                // ID and text are trimmed of surrounding whitespace before being written
                writer.WriteElementString("id", current.Id.Trim());
                writer.WriteElementString("content", current.Text.Trim());
                writer.WriteElementString("parent_id", String.Empty);

                writer.WriteEndElement(); // artifact
            }

            writer.WriteEndElement(); // artifacts
        }
Esempio n. 60
0
        // Checks StopwordsRemover.ProcessArtifacts on three artifacts with two stopwords,
        // a minimum word length of 4, and number removal turned off.
        public void CleanArtifactsWithStopwords()
        {
            TLArtifactsCollection artifacts = new TLArtifactsCollection();
            artifacts.Add(new TLArtifact("id1", "clean these words"));
            artifacts.Add(new TLArtifact("id2", "this has a stopword"));
            artifacts.Add(new TLArtifact("id3", "an expression"));

            TLStopwords stopwords = new TLStopwords();
            stopwords.Add("these");
            stopwords.Add("this");

            TLArtifactsCollection processedArtifacts = StopwordsRemover.ProcessArtifacts(artifacts, stopwords, 4, false);

            // Assert.AreEqual takes (expected, actual); keeping that order makes a failure
            // message report the expected and actual values the right way round.
            Assert.AreEqual("clean words", processedArtifacts["id1"].Text);
            Assert.AreEqual("stopword", processedArtifacts["id2"].Text);
            Assert.AreEqual("expression", processedArtifacts["id3"].Text);
        }