/// <summary>
/// Verifies that unusual artifact text (empty, whitespace-only, accented,
/// and non-ASCII characters) survives a round trip through the component.
/// </summary>
public void RarePhrasesTest()
{
    TLArtifactsCollection collection = new TLArtifactsCollection();
    collection.Add(new TLArtifact("blank", ""));
    collection.Add(new TLArtifact("space", " "));
    collection.Add(new TLArtifact("accents", "à ì")); // à = 133 ì = 141
    collection.Add(new TLArtifact("unrecognized", "╣")); // the debugger sees this value as a square, ascii is 441
    Workspace.Store("listOfArtifacts", collection);
    TestComponent.Compute();
    collection = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
    if (collection["blank"].Text != "")
    {
        Assert.Fail("blank got '" + collection["blank"].Text + "' when '' was expected");
    }
    if (collection["space"].Text != "")
    {
        Assert.Fail("space got '" + collection["space"].Text + "' when '' was expected");
    }
    // BUG FIX: these two checks previously compared against "" even though the
    // failure messages (and the corrected sibling copy of this test) expect the
    // original text to be preserved.
    if (collection["accents"].Text != "à ì")
    {
        Assert.Fail("accents got '" + collection["accents"].Text + "' when 'à ì' was expected");
    }
    if (collection["unrecognized"].Text != "╣")
    {
        Assert.Fail("unrecognized got '" + collection["unrecognized"].Text + "' when '╣' was expected");
    }
}
/// <summary>
/// Constructor. Stores the source/target artifact collections and the
/// configuration object for later use when the script is executed.
/// </summary>
/// <param name="source">Source artifacts</param>
/// <param name="target">Target artifacts</param>
/// <param name="config">Configuration object</param>
public GibbsLDAScript(TLArtifactsCollection source, TLArtifactsCollection target, GibbsLDAConfig config) : base()
{
    _source = source;
    _target = target;
    _config = config;
}
/// <summary>
/// Loads the artifacts list from the workspace, removes stopwords according
/// to the configured minimum word length and number-removal settings, and
/// stores the result back under the same workspace key.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection artifacts = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
    TLArtifactsCollection filtered = SimpleStopwordsRemover.ProcessArtifacts(artifacts, _config.MinWordLength, _config.RemoveNumbers);
    Workspace.Store("listOfArtifacts", filtered);
}
/// <summary>
/// Loads the artifacts list from the workspace, splits camel-case identifiers
/// (optionally lowercasing the result), and stores the processed collection
/// back under the same workspace key.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection input = (TLArtifactsCollection)Workspace.Load("ListOfArtifacts");
    TLArtifactsCollection output = CamelCaseSplitter.ProcessArtifacts(input, _config.ConvertLowercase);
    Workspace.Store("ListOfArtifacts", output);
}
/// <summary>
/// Exports an artifacts collection to an XML file at the given path.
/// </summary>
/// <param name="artifactsCollection">Artifacts to export</param>
/// <param name="outputPath">Destination file path</param>
/// <param name="collectionId">Collection identifier written to the file</param>
/// <param name="name">Collection name</param>
/// <param name="version">Collection version</param>
/// <param name="description">Collection description</param>
/// <exception cref="TraceLabSDK.ComponentException">Thrown when the collection is null</exception>
public static void Export(TLArtifactsCollection artifactsCollection, string outputPath, string collectionId, string name, string version, string description)
{
    if (artifactsCollection == null)
    {
        throw new TraceLabSDK.ComponentException("Received null artifacts collection.");
    }
    System.Xml.XmlWriterSettings settings = new System.Xml.XmlWriterSettings();
    settings.Indent = true;
    settings.CloseOutput = true;
    settings.CheckCharacters = true;
    //create file
    using (System.Xml.XmlWriter writer = System.Xml.XmlWriter.Create(outputPath, settings))
    {
        writer.WriteStartDocument();
        writer.WriteStartElement("artifacts_collection");
        WriteCollectionInfo(writer, collectionId, name, version, description);
        WriteArtifacts(artifactsCollection, writer);
        writer.WriteEndElement(); //artifacts_collection
        writer.WriteEndDocument();
        // IDIOM: the explicit writer.Close() was removed - disposing the writer
        // at the end of the using block already closes the underlying output.
    }
    System.Diagnostics.Trace.WriteLine("File created , you can find the file " + outputPath);
}
/// <summary>
/// Placeholder test for the tracer component: it wires up source/target
/// artifacts and a dictionary, runs Compute, and loads the similarity matrix.
/// NOTE(review): this test is unfinished - it ends with an unconditional
/// Assert.Fail() until output-format assertions are written (see TODOs).
/// </summary>
public void TestTracingOfComponent()
{
    TLArtifactsCollection sourceArtifacts = new TLArtifactsCollection();
    TLArtifactsCollection targetArtifacts = new TLArtifactsCollection();
    TLDictionaryIndex dictionary = new TLDictionaryIndex();
    // TODO: add inputs that matter
    sourceArtifacts.Add(new TLArtifact("id1", "first text"));
    sourceArtifacts.Add(new TLArtifact("id2", "words to do stuff with"));
    sourceArtifacts.Add(new TLArtifact("id3", "some more text"));
    targetArtifacts.Add(new TLArtifact("id1", "hello world"));
    targetArtifacts.Add(new TLArtifact("id2", "very very random yes indeed"));
    targetArtifacts.Add(new TLArtifact("id3", "yep"));
    targetArtifacts.Add(new TLArtifact("id4", "chickens in the coop"));
    dictionary.AddTermEntry("term", 3, 3, 0.2);
    Workspace.Store("sourceArtifacts", sourceArtifacts);
    Workspace.Store("targetArtifacts", targetArtifacts);
    Workspace.Store("dictionaryIndex", dictionary);
    ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;
    TestComponent.Compute();
    TLSimilarityMatrix simMat = (TLSimilarityMatrix)Workspace.Load("similarityMatrix");
    // TODO: add tests to make sure the output is correctly formatted
    Assert.Fail();
}
/// <summary>
/// Loads the artifacts list from the workspace, applies Porter stemming to
/// every artifact, and stores the stemmed collection back under the same key.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection input = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
    TLArtifactsCollection output = PorterStemmerUtils.ProcessArtifacts(input);
    Workspace.Store("listOfArtifacts", output);
}
/// <summary>
/// Imports a corpus from the configured identifier and document files and
/// publishes both the artifacts collection and its document count.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection corpus = Importers.Corpus.Import(_config.Identifiers.Absolute, _config.Documents.Absolute);
    Workspace.Store("ListOfArtifacts", corpus);
    Workspace.Store("NumberOfDocuments", corpus.Count);
}
/// <summary>
/// Verifies the component removes the stored stopwords from each artifact.
/// </summary>
public void CleanArtifactsWithStopwords()
{
    TLArtifactsCollection artifacts = new TLArtifactsCollection();
    artifacts.Add(new TLArtifact("id1", "clean these words"));
    artifacts.Add(new TLArtifact("id2", "this has a stopword"));
    artifacts.Add(new TLArtifact("id3", "expression"));
    Workspace.Store("listOfArtifacts", artifacts);
    TLStopwords stopwords = new TLStopwords();
    stopwords.Add("these");
    stopwords.Add("has");
    stopwords.Add("an");
    stopwords.Add("a");
    Workspace.Store("stopwords", stopwords);
    TestComponent.Compute();
    artifacts = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
    stopwords = (TLStopwords)Workspace.Load("stopwords");
    // BUG FIX: Assert.AreEqual takes (expected, actual); the arguments were
    // reversed, which produces misleading failure messages.
    Assert.AreEqual("clean words", artifacts["id1"].Text);
    Assert.AreEqual("this stopword", artifacts["id2"].Text);
    Assert.AreEqual("expression", artifacts["id3"].Text);
}
/// <summary>
/// Loads the artifacts list from the workspace, runs the SEMERU identifier
/// splitter over it, and stores the processed collection back under the same key.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection input = (TLArtifactsCollection)Workspace.Load("ListOfArtifacts");
    TLArtifactsCollection output = SemeruSplitter.ProcessArtifacts(input, _config.KeepCompoundIdentifier);
    Workspace.Store("ListOfArtifacts", output);
}
/// <summary>
/// Builds a similarity matrix by running one dictionary search per source
/// artifact and recording every result as a link.
/// </summary>
/// <param name="sourceArtifacts">Source artifacts to trace from</param>
/// <param name="dict">Dictionary index to search against</param>
/// <param name="config">Configuration supplying the similarity metric</param>
/// <returns>Similarity matrix with links from source artifacts to search hits</returns>
/// <exception cref="ComponentException">Thrown when sourceArtifacts or dict is null</exception>
private static TLSimilarityMatrix Process(TLArtifactsCollection sourceArtifacts, TLDictionaryIndex dict, TracerConfig config)
{
    if (sourceArtifacts == null)
    {
        throw new ComponentException("Received null sourceArtifacts");
    }
    if (dict == null)
    {
        throw new ComponentException("Received null dictionaryIndex");
    }
    Searcher searcher = new Searcher(SimilarityMetricFactory.GetSimiliarityMetric(config.SimilarityMetric));
    TLSimilarityMatrix matrix = new TLSimilarityMatrix();
    // One search per source artifact; each hit becomes a link in the matrix.
    foreach (TLArtifact source in sourceArtifacts.Values)
    {
        List<Result> hits = searcher.search(source.Text, dict);
        foreach (Result hit in hits)
        {
            matrix.AddLink(source.Id, hit.ArtifactId, hit.Ranking);
        }
    }
    return matrix;
}
/// <summary>
/// Imports a corpus from parallel identifier and document files: line i of
/// idPath names the artifact whose text is line i of docPath. Duplicate
/// identifiers are disambiguated with an "_N" suffix.
/// </summary>
/// <param name="idPath">Path to the identifiers file (one id per line)</param>
/// <param name="docPath">Path to the documents file (one document per line)</param>
/// <returns>Imported artifacts collection</returns>
public static TLArtifactsCollection Import(String idPath, String docPath)
{
    TLArtifactsCollection artifacts = new TLArtifactsCollection();
    // BUG FIX: both readers are now disposed via using blocks - docFile was
    // previously never closed, and idFile leaked if an exception was thrown.
    using (StreamReader idFile = new StreamReader(idPath))
    using (StreamReader docFile = new StreamReader(docPath))
    {
        String origid;
        while ((origid = idFile.ReadLine()) != null)
        {
            // read the matching document line
            String doc = docFile.ReadLine().Trim();
            String baseId = origid.Trim();
            String id = baseId;
            int num = 0;
            // disambiguate repeated ids: "id", "id_1", "id_2", ...
            while (artifacts.ContainsKey(id))
            {
                num++;
                id = baseId + "_" + num.ToString();
            }
            artifacts.Add(new TLArtifact(id, doc));
        }
    }
    return (artifacts);
}
/// <summary>
/// Writes the artifacts collection to a CSV file at the given absolute path,
/// appending a ".csv" extension when missing.
/// </summary>
/// <param name="artifacts">Artifacts to export</param>
/// <param name="outputPath">Absolute destination path</param>
/// <exception cref="ComponentException">Thrown when the collection is null, the path is null, or the path is not absolute</exception>
private static void CreateCSVReport(TLArtifactsCollection artifacts, string outputPath)
{
    if (artifacts == null)
    {
        throw new ComponentException("Received artifacts collection is null!");
    }
    if (outputPath == null)
    {
        throw new ComponentException("Output path cannot be null.");
    }
    if (!System.IO.Path.IsPathRooted(outputPath))
    {
        throw new ComponentException(String.Format("Absolute output path is required. Given path is '{0}'", outputPath));
    }
    // ensure the report always carries a .csv extension
    if (outputPath.EndsWith(".csv", StringComparison.CurrentCultureIgnoreCase) == false)
    {
        outputPath = outputPath + ".csv";
    }
    // IDIOM: disposing the writer at the end of the using block already
    // flushes and closes it; the explicit Flush()/Close() calls were redundant.
    using (System.IO.TextWriter writeFile = new StreamWriter(outputPath))
    {
        WriteArtifactsToFile(artifacts, writeFile);
    }
}
/// <summary>
/// Verifies that unusual artifact text (empty, whitespace-only, accented,
/// and non-ASCII characters) survives a round trip through the component.
/// </summary>
public void RarePhrasesTest()
{
    TLArtifactsCollection input = new TLArtifactsCollection();
    input.Add(new TLArtifact("blank", ""));
    input.Add(new TLArtifact("space", " "));
    input.Add(new TLArtifact("accents", "à ì")); // à = 133 ì = 141
    input.Add(new TLArtifact("unrecognized", "╣")); // the debugger sees this value as a square ascii is 441
    Workspace.Store("listOfArtifacts", input);
    TestComponent.Compute();
    TLArtifactsCollection output = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
    // Each odd phrase must come back exactly as it went in.
    if (output["blank"].Text != "")
    {
        Assert.Fail("blank got '" + output["blank"].Text + "' when '' was expected");
    }
    if (output["space"].Text != "")
    {
        Assert.Fail("space got '" + output["space"].Text + "' when '' was expected");
    }
    if (output["accents"].Text != "à ì")
    {
        Assert.Fail("accents got '" + output["accents"].Text + "' when 'à ì' was expected");
    }
    if (output["unrecognized"].Text != "╣")
    {
        Assert.Fail("unrecognized got '" + output["unrecognized"].Text + "' when '╣' was expected");
    }
}
/// <summary>
/// Constructor. Eagerly converts the source and target artifact collections
/// into term-document matrices and keeps the configuration for later use.
/// </summary>
/// <param name="source">Source artifacts</param>
/// <param name="target">Target artifacts</param>
/// <param name="config">Configuration object</param>
public LDAScript(TLArtifactsCollection source, TLArtifactsCollection target, LDAConfig config) : base()
{
    _source = new TermDocumentMatrix(source);
    _target = new TermDocumentMatrix(target);
    _config = config;
}
/// <summary>
/// Verifies that tracing against an empty dictionary still produces an
/// (empty) similarity matrix rather than failing or returning null.
/// </summary>
public void EmptyDictionaryIndexTest()
{
    TLArtifactsCollection sources = new TLArtifactsCollection();
    sources.Add(new TLArtifact("id", "text"));
    TLArtifactsCollection targets = new TLArtifactsCollection();
    targets.Add(new TLArtifact("id", "text"));
    Workspace.Store("sourceArtifacts", sources);
    Workspace.Store("targetArtifacts", targets);
    Workspace.Store("dictionaryIndex", new TLDictionaryIndex());
    ((TracerConfig)TestComponent.Configuration).SimilarityMetric = SimilarityMetricMethod.SimpleMatching;
    TestComponent.Compute();
    TLSimilarityMatrix simMat = (TLSimilarityMatrix)Workspace.Load("similarityMatrix");
    if (simMat == null || simMat.Count != 0)
    {
        Assert.Fail("Similarity Matrix should still be created but have nothing in it");
    }
}
/// <summary>
/// Verifies the component tolerates an empty artifacts collection.
/// </summary>
public void EmptyCollectionTest()
{
    TLArtifactsCollection emptyCollection = new TLArtifactsCollection();
    Workspace.Store("listOfArtifacts", emptyCollection);
    TestComponent.Compute();
}
/// <summary>
/// Verifies the stemming component reduces each word to its expected stem.
/// </summary>
public void CleanListOfArtifacts()
{
    TLArtifactsCollection input = new TLArtifactsCollection();
    input.Add(new TLArtifact("id", "addition"));
    input.Add(new TLArtifact("id2", "works all arounds"));
    input.Add(new TLArtifact("id3", "the world is nothing but a huge sphere"));
    Workspace.Store("listOfArtifacts", input);
    TestComponent.Compute();
    TLArtifactsCollection output = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
    if (output["id"].Text != "addit")
    {
        Assert.Fail("id got '" + output["id"].Text + "' when 'addit' was expected");
    }
    if (output["id2"].Text != "work all around")
    {
        Assert.Fail("id2 got '" + output["id2"].Text + "' when 'work all around' was expected");
    }
    if (output["id3"].Text != "the world is noth but a huge sphere")
    {
        Assert.Fail("id3 got '" + output["id3"].Text + "' when 'the world is noth but a huge sphere' was expected");
    }
}
/// <summary>
/// Imports a COEST answer set: validates the configured file path, builds the
/// answer matrix from the stored source/target artifacts, and publishes it.
/// </summary>
/// <exception cref="ArgumentException">Thrown when either artifact collection is null or the path fails validation</exception>
public override void Compute()
{
    TLArtifactsCollection sourceArtifacts = (TLArtifactsCollection)Workspace.Load("sourceArtifacts");
    TLArtifactsCollection targetArtifacts = (TLArtifactsCollection)Workspace.Load("targetArtifacts");
    if (sourceArtifacts == null)
    {
        throw new ArgumentException("Source artifacts cannot be null.");
    }
    if (targetArtifacts == null)
    {
        throw new ArgumentException("Target artifacts cannot be null.");
    }
    // Guard clause: refuse to import when the configured path is invalid.
    string error;
    if (!CoestDatasetImporterHelper.ValidatePath(m_config.FilePath, "Answer Set File", out error))
    {
        throw new ArgumentException(error);
    }
    var answerMatrix = CoestDatasetImporterHelper.ImportAnswerSet(m_config.FilePath, sourceArtifacts, targetArtifacts, Logger, m_config.TrimElementValues);
    Workspace.Store("answerMatrix", answerMatrix);
    Logger.Trace(String.Format("Answer matrix imported from {0}.", m_config.FilePath));
}
/// <summary>
/// Loads the artifacts list from the workspace, runs text cleanup (optionally
/// lowercasing), and stores the cleaned collection back under the same key.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection input = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
    TLArtifactsCollection output = Cleanup.ProcessArtifacts(input, _config.ConvertLowercase);
    Workspace.Store("listOfArtifacts", output);
}
/// <summary>
/// Imports a corpus from parallel identifier and document files: line i of
/// idPath names the artifact whose text is line i of docPath. Duplicate
/// identifiers are disambiguated with an "_N" suffix.
/// </summary>
/// <param name="idPath">Path to the identifiers file (one id per line)</param>
/// <param name="docPath">Path to the documents file (one document per line)</param>
/// <returns>Imported artifacts collection</returns>
public static TLArtifactsCollection Import(String idPath, String docPath)
{
    TLArtifactsCollection artifacts = new TLArtifactsCollection();
    // BUG FIX: both readers are now disposed via using blocks - docFile was
    // previously never closed, and idFile leaked if an exception was thrown.
    using (StreamReader idFile = new StreamReader(idPath))
    using (StreamReader docFile = new StreamReader(docPath))
    {
        String origid;
        while ((origid = idFile.ReadLine()) != null)
        {
            // read the matching document line
            String doc = docFile.ReadLine().Trim();
            String baseId = origid.Trim();
            String id = baseId;
            int num = 0;
            // disambiguate repeated ids: "id", "id_1", "id_2", ...
            while (artifacts.ContainsKey(id))
            {
                num++;
                id = baseId + "_" + num.ToString();
            }
            artifacts.Add(new TLArtifact(id, doc));
        }
    }
    return artifacts;
}
/// <summary>
/// Verifies the component leaves already-clean text unchanged.
/// </summary>
public void CleanListOfArtifacts()
{
    TLArtifactsCollection input = new TLArtifactsCollection();
    input.Add(new TLArtifact("id", "this is text"));
    input.Add(new TLArtifact("id2", "this is text"));
    input.Add(new TLArtifact("id3", "this is more text"));
    Workspace.Store("listOfArtifacts", input);
    TestComponent.Compute();
    TLArtifactsCollection output = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
    if (output["id"].Text != "this is text")
    {
        Assert.Fail("id got '" + output["id"].Text + "' when 'this is text' was expected");
    }
    if (output["id2"].Text != "this is text")
    {
        Assert.Fail("id2 got '" + output["id2"].Text + "' when 'this is text' was expected");
    }
    if (output["id3"].Text != "this is more text")
    {
        Assert.Fail("id3 got '" + output["id3"].Text + "' when 'this is more text' was expected");
    }
}
/// <summary>
/// Exports the artifacts collection from the workspace to an XML file.
/// The collection name and id are derived from the output file name
/// (directory and extension stripped).
/// </summary>
public override void Compute()
{
    TLArtifactsCollection artifactsCollection = (TLArtifactsCollection)Workspace.Load("artifactsCollection");
    string path = (string)Workspace.Load("outputPath");
    string collectionName;
    if (path.EndsWith(".xml", StringComparison.CurrentCultureIgnoreCase))
    {
        // BUG FIX: Remove(LastIndexOf(".") + 1) left a trailing dot in the
        // collection name; strip the extension including the dot.
        collectionName = path.Remove(path.LastIndexOf('.'));
    }
    else
    {
        collectionName = path;
        path = path + ".xml";
    }
    // keep only the file name portion as the collection name
    collectionName = collectionName.Substring(collectionName.LastIndexOf('\\') + 1);
    string id = collectionName;
    ArtifactsCollectionExporterUtilities.Export(artifactsCollection, path, id, collectionName, "1.1", collectionName);
    Workspace.Store("collectionId", id);
    Logger.Info(String.Format("Artifacts Collection has been saved into xml file '{0}'", path));
}
/// <summary>
/// Loads the artifacts list from the workspace, processes it in place
/// according to the configuration, and stores it back under the same key.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection artifacts = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
    // ProcessArtifacts mutates the collection in place; re-store afterwards.
    ProcessArtifacts(artifacts, _config);
    Workspace.Store("listOfArtifacts", artifacts);
}
/// <summary>
/// Imports a corpus and its document map from the configured identifier and
/// document files and publishes both to the workspace.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection corpus = Importers.Corpus.Import(_config.Identifiers.Absolute, _config.Documents.Absolute);
    List<string> documentMap = Importers.Corpus.Map(_config.Identifiers.Absolute);
    Workspace.Store("ListOfArtifacts", corpus);
    Workspace.Store("DocumentMap", documentMap);
}
/// <summary>
/// Computes VSM similarities between the stored source and target artifacts
/// using the configured weighting scheme and publishes the matrix.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection source = (TLArtifactsCollection)Workspace.Load("SourceArtifacts");
    TLArtifactsCollection target = (TLArtifactsCollection)Workspace.Load("TargetArtifacts");
    TLSimilarityMatrix similarities = VSM.Compute(source, target, _config.WeightingScheme);
    Workspace.Store("Similarities", similarities);
}
/// <summary>
/// Exports the artifacts collection from the workspace to an XML file using
/// the collection metadata supplied by the component configuration.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection artifactsCollection = (TLArtifactsCollection)Workspace.Load("artifactsCollection");
    ArtifactsCollectionExporterUtilities.Export(
        artifactsCollection,
        Config.Path,
        Config.CollectionId,
        Config.CollectionName,
        Config.CollectionVersion,
        Config.CollectionDescription);
    Logger.Info(String.Format("Artifacts Collection has been saved into xml file '{0}'", Config.Path.Absolute));
}
/// <summary>
/// Exports effectiveness measures comparing the computed similarity matrix
/// against the gold set for the given queries.
/// </summary>
public override void Compute()
{
    TLSimilarityMatrix similarities = (TLSimilarityMatrix)Workspace.Load("SimilarityMatrix");
    TLSimilarityMatrix goldSet = (TLSimilarityMatrix)Workspace.Load("GoldSet");
    TLArtifactsCollection queries = (TLArtifactsCollection)Workspace.Load("Queries");
    Effectiveness.Export(queries, similarities, goldSet, _config.AllMethodsFile, _config.BestMethodsFile);
}
/// <summary>
/// Exports the artifacts collection from the workspace to a CSV report at
/// the configured absolute path.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection artifactsCollection = (TLArtifactsCollection)Workspace.Load("artifactsCollection");
    CreateCSVReport(artifactsCollection, Config.Path.Absolute);
    Logger.Info(String.Format("Artifacts Collection has been saved into csv file '{0}'", Config.Path.Absolute));
}
/// <summary>
/// Computes Jensen-Shannon divergence similarities between the stored source
/// and target artifacts and publishes the matrix.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection sourceArtifacts = (TLArtifactsCollection)Workspace.Load("SourceArtifacts");
    TLArtifactsCollection targetArtifacts = (TLArtifactsCollection)Workspace.Load("TargetArtifacts");
    TLSimilarityMatrix similarities = JSD.Compute(sourceArtifacts, targetArtifacts);
    Workspace.Store("Similarities", similarities);
}
/// <summary>
/// Vectorizes the stored artifacts with the configured representation and
/// publishes both the document vectors and the term document frequencies.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection artifacts = (TLArtifactsCollection)Workspace.Load("ListOfArtifacts");
    Models.Vectorizer vectorizer = new Models.Vectorizer(artifacts, _config.Representation);
    Workspace.Store("DocumentVectors", vectorizer.Vectors);
    Workspace.Store("DocumentFrequencies", vectorizer.Frequencies);
}
/// <summary>
/// Builds a TF-IDF dictionary index from the stored artifacts list and
/// publishes it to the workspace.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection artifacts = (TLArtifactsCollection)Workspace.Load("listOfArtifacts");
    TLDictionaryIndex dictionary = BuildDictionary(artifacts, Logger);
    Workspace.Store("dictionaryIndex", dictionary);
}
/// <summary>
/// Extracts and returns all terms with the specified POS from a TLArtifactsCollection.
/// </summary>
/// <param name="artifacts">List of artifacts</param>
/// <param name="pos">Part of speech to extract</param>
/// <param name="modelFile">Training model file location</param>
/// <returns>TLArtifactsCollection consisting of only the terms with the specified POS</returns>
public static TLArtifactsCollection Extract(TLArtifactsCollection artifacts, POSTaggerSpeechType pos, string modelFile)
{
    TLArtifactsCollection extracted = new TLArtifactsCollection();
    foreach (KeyValuePair<string, TLArtifact> entry in artifacts)
    {
        extracted.Add(entry.Key, ExtractArtifact(entry.Value, pos, modelFile));
    }
    return extracted;
}
/// <summary>
/// Constructor. Builds term-document matrices for the source and target
/// collections, keeps their document maps, and combines the two matrices
/// into a single corpus matrix.
/// </summary>
/// <param name="name">Corpus name</param>
/// <param name="source">Source artifacts</param>
/// <param name="target">Target artifacts</param>
public LDACorpus(string name, TLArtifactsCollection source, TLArtifactsCollection target)
{
    Name = name;
    TermDocumentMatrix sMatrix = new TermDocumentMatrix(source);
    TermDocumentMatrix tMatrix = new TermDocumentMatrix(target);
    _sourceDocs = sMatrix.DocMap;
    _targetDocs = tMatrix.DocMap;
    // combined matrix spans both source and target documents
    _matrix = TermDocumentMatrix.Combine(sMatrix, tMatrix);
}
/// <summary>
/// Runs the GibbsLDA genetic-algorithm R script over the stored source and
/// target artifacts and publishes the resulting configuration.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection sourceArtifacts = (TLArtifactsCollection)Workspace.Load("SourceArtifacts");
    TLArtifactsCollection targetArtifacts = (TLArtifactsCollection)Workspace.Load("TargetArtifacts");
    REngine engine = new REngine(_config.RScriptPath);
    GibbsLDAConfig result = (GibbsLDAConfig)engine.Execute(new GibbsLDA_GAScript(sourceArtifacts, targetArtifacts, _config));
    Workspace.Store("GibbsLDAConfig", result);
}
/// <summary>
/// Runs the RTM R script over the stored source and target artifacts and
/// publishes the resulting similarity matrix.
/// </summary>
public override void Compute()
{
    TLArtifactsCollection sourceArtifacts = (TLArtifactsCollection)Workspace.Load("SourceArtifacts");
    TLArtifactsCollection targetArtifacts = (TLArtifactsCollection)Workspace.Load("TargetArtifacts");
    REngine engine = new REngine(_config.RScriptPath);
    TLSimilarityMatrix similarities = (TLSimilarityMatrix)engine.Execute(new RTMScript(sourceArtifacts, targetArtifacts, _config));
    Workspace.Store("Similarities", similarities);
}
/// <summary>
/// Exports a corpus in the form (each line):
/// ID TEXT TEXT TEXT TEXT TEXT ...
/// </summary>
/// <param name="artifacts">Artifacts collection</param>
/// <param name="outputfile">Output file path</param>
public static void ExportFile(TLArtifactsCollection artifacts, string outputfile)
{
    // BUG FIX: the writer is now disposed via a using block, so the stream is
    // closed (and flushed) even if a WriteLine throws; previously it leaked.
    using (TextWriter tw = new StreamWriter(outputfile))
    {
        foreach (TLArtifact artifact in artifacts.Values)
        {
            // collapse the text onto one line: newlines become spaces, CRs are dropped
            tw.WriteLine(artifact.Id + " " + artifact.Text.Replace("\n", " ").Replace("\r", String.Empty));
        }
    }
}
/// <summary>
/// Builds a small artifacts collection ("0".."2") for use as mock input.
/// </summary>
/// <returns>Artifacts collection with three numbered artifacts</returns>
private static object PrepareMockData()
{
    TLArtifactsCollection artifacts = new TLArtifactsCollection();
    string[] texts = { "artifact text 1", "artifact text 2", "artifact text 3" };
    for (int i = 0; i < texts.Length; i++)
    {
        artifacts.Add(new TLArtifact(i.ToString(), texts[i]));
    }
    return artifacts;
}
/// <summary>
/// Builds a TF-IDF dictionary index from the given artifacts list.
/// </summary>
/// <param name="listOfArtifacts">Artifacts to index</param>
/// <param name="logger">Component logger passed through to the index builder</param>
/// <returns>TF-IDF dictionary index</returns>
/// <exception cref="ComponentException">Thrown when listOfArtifacts is null</exception>
private static TLDictionaryIndex BuildDictionary(TLArtifactsCollection listOfArtifacts, ComponentLogger logger)
{
    if (listOfArtifacts == null)
    {
        throw new ComponentException("Received null listOfArtifacts");
    }
    return TFIDFIndexBuilder.build(listOfArtifacts, logger);
}
/// <summary>
/// Writes the artifacts as CSV rows ("Id,Text" header, then one quoted row
/// per artifact) to the given writer.
/// </summary>
/// <param name="artifacts">Artifacts to write</param>
/// <param name="writeFile">Destination text writer</param>
private static void WriteArtifactsToFile(TLArtifactsCollection artifacts, System.IO.TextWriter writeFile)
{
    //header
    writeFile.WriteLine("Id,Text");
    foreach (TLArtifact artifact in artifacts.Values)
    {
        // BUG FIX: per RFC 4180, a double quote inside a quoted CSV field must
        // be escaped by doubling it; previously embedded quotes produced
        // malformed rows.
        writeFile.WriteLine("\"{0}\",\"{1}\"",
            artifact.Id.Replace("\"", "\"\""),
            artifact.Text.Replace("\"", "\"\""));
    }
}
/// <summary>
/// Processes an artifacts collection using the Snowball stemming algorithm.
/// </summary>
/// <param name="artifacts">Artifacts collection</param>
/// <param name="langauge">Stemmer language (parameter keeps its historical spelling for compatibility)</param>
/// <returns>Stemmed artifacts</returns>
public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection artifacts, SnowballStemmerEnum langauge)
{
    TLArtifactsCollection stemmed = new TLArtifactsCollection();
    foreach (TLArtifact original in artifacts.Values)
    {
        TLArtifact copy = new TLArtifact(original.Id, String.Empty);
        copy.Text = ProcessText(original.Text, langauge);
        stemmed.Add(copy);
    }
    return stemmed;
}
/// <summary>
/// Processes a TLArtifactsCollection by removing terms that match the given list of stopwords.
/// </summary>
/// <param name="listOfArtifacts">Artifacts collection</param>
/// <param name="stopwords">Stopwords collection</param>
/// <param name="minWordLength">Minimum word length</param>
/// <param name="removeNumbers">Flag to remove numbers</param>
/// <returns>Processed artifacts</returns>
public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, TLStopwords stopwords, int minWordLength, bool removeNumbers)
{
    TLArtifactsCollection result = new TLArtifactsCollection();
    foreach (TLArtifact original in listOfArtifacts.Values)
    {
        TLArtifact copy = new TLArtifact(original.Id, String.Empty);
        copy.Text = ProcessText(original.Text, stopwords, minWordLength, removeNumbers);
        result.Add(copy);
    }
    return result;
}
/// <summary>
/// Splits identifiers for each artifact in an artifacts collection.
/// </summary>
/// <param name="listOfArtifacts">Artifacts collection</param>
/// <param name="keepCompoundIdentifier">Whether to keep the original compound identifier alongside its parts</param>
/// <returns>Splitted artifacts</returns>
public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, bool keepCompoundIdentifier)
{
    TLArtifactsCollection result = new TLArtifactsCollection();
    foreach (TLArtifact original in listOfArtifacts.Values)
    {
        TLArtifact copy = new TLArtifact(original.Id, String.Empty);
        copy.Text = ProcessText(original.Text, keepCompoundIdentifier);
        result.Add(copy);
    }
    return result;
}
/// <summary>
/// Processes an artifacts collection, splitting CamelCase terms.
/// </summary>
/// <param name="listOfArtifacts">Artifacts collection</param>
/// <param name="convertToLowercase">Option to convert resulting terms to lowercase</param>
/// <returns>Processed artifacts</returns>
public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, bool convertToLowercase)
{
    TLArtifactsCollection result = new TLArtifactsCollection();
    foreach (TLArtifact original in listOfArtifacts.Values)
    {
        TLArtifact copy = new TLArtifact(original.Id, String.Empty);
        copy.Text = ProcessText(original.Text, convertToLowercase);
        result.Add(copy);
    }
    return result;
}
/// <summary>
/// Constructor. Builds a document vector for every artifact (splitting its
/// text on single spaces) and a shared document-frequency vector counting,
/// for each word, how many documents contain it.
/// </summary>
/// <param name="artifacts">Artifacts to vectorize</param>
/// <param name="representation">Counting mode: "Ordinal" counts every occurrence per document; any other value yields boolean (0/1) per-document counts</param>
public Vectorizer(TLArtifactsCollection artifacts, String representation)
{
    vectors = new DocumentVectorCollection();
    freq = new DocumentVector("DocumentFrequencies");
    foreach (KeyValuePair<string, TLArtifact> kvp in artifacts)
    {
        // vars
        String docID = kvp.Value.Id;
        String[] words = kvp.Value.Text.Split(' ');
        // create new document representation
        DocumentVector vec = new DocumentVector(docID);
        // tracks words already counted for this document so the document
        // frequency is incremented at most once per document
        List<String> addedWords = new List<String>();
        // loop over each word and update its frequency
        foreach (String word in words)
        {
            // update term-doc frequency only ONCE per document
            if (!freq.ContainsKey(word))
            {
                freq.Add(word, 1);
                addedWords.Add(word);
            }
            else if (!addedWords.Contains(word))
            {
                freq[word]++;
                addedWords.Add(word);
            }
            // update word freqency: first sighting always records 1; repeat
            // sightings only increment in the "Ordinal" representation
            if (!vec.ContainsKey(word))
            {
                vec.Add(word, 1);
            }
            else
            {
                if (representation == "Ordinal")
                {
                    vec[word]++;
                }
            }
            // update MaxFreq (the most frequent word seen so far in this document)
            if (vec[word] > vec.MaxFreq.Value)
            {
                vec.MaxFreq = new KeyValuePair<string, int>(word, vec[word]);
            }
        }
        // add document to vector collection
        vectors.Add(vec);
    }
}
/// <summary>
/// Cleans every artifact's text in place using the preprocessor cleanup,
/// honoring the configured digit-removal option.
/// </summary>
/// <param name="listOfArtifacts">Artifacts to clean (mutated in place)</param>
/// <param name="config">Configuration supplying the RemoveDigits flag</param>
/// <exception cref="ComponentException">Thrown when listOfArtifacts is null</exception>
private static void ProcessArtifacts(TLArtifactsCollection listOfArtifacts, PreprocessorCleanUpComponentConfig config)
{
    if (listOfArtifacts == null)
    {
        throw new ComponentException("Received null listofArtifacts");
    }
    foreach (TLArtifact current in listOfArtifacts.Values)
    {
        current.Text = PreprocessorCleanUp.Process(current.Text, config.RemoveDigits);
    }
}
/// <summary>
/// Stems every artifact's text in place using the preprocessor stemmer.
/// </summary>
/// <param name="listOfArtifacts">Artifacts to stem (mutated in place)</param>
/// <exception cref="ComponentException">Thrown when listOfArtifacts is null</exception>
private static void ProcessArtifacts(TLArtifactsCollection listOfArtifacts)
{
    if (listOfArtifacts == null)
    {
        // FIX: corrected "Recieved Null" typo for consistency with the other
        // null-check messages in this codebase ("Received null ...").
        throw new ComponentException("Received null listofArtifacts");
    }
    foreach (TLArtifact artifact in listOfArtifacts.Values)
    {
        artifact.Text = PreprocessorStemmer.Process(artifact.Text);
    }
}
/// <summary>
/// Computes the traceability between source and target artifacts using dictionary and American Corpus Term weigths.
/// </summary>
/// <param name="sourceArtifacts">The source artifacts.</param>
/// <param name="targetArtifacts">The target artifacts.</param>
/// <param name="dict">The dictionary index searched for each source artifact's text.</param>
/// <param name="ancTermsWeights">The ANC term weights applied during the search.</param>
/// <param name="config">The config supplying the similarity metric.</param>
/// <returns>Similarity matrix with links between source and target artifacts</returns>
/// <exception cref="ComponentException">Thrown when any of the inputs is null</exception>
private static TLSimilarityMatrix ComputeTraceability(TLArtifactsCollection sourceArtifacts, TLArtifactsCollection targetArtifacts, TLDictionaryIndex dict, TLKeyValuePairsList ancTermsWeights, TracerConfig config)
{
    if (sourceArtifacts == null)
    {
        throw new ComponentException("Received source artifacts are null!");
    }
    if (targetArtifacts == null)
    {
        throw new ComponentException("Received target artifacts are null!");
    }
    if (dict == null)
    {
        throw new ComponentException("Received dictionary index is null!");
    }
    if (ancTermsWeights == null)
    {
        throw new ComponentException("Received 'ancTermsWeights' is null!");
    }
    TLSimilarityMatrix similarityMatrix = new TLSimilarityMatrix();
    ANCSearcher searcher = new ANCSearcher(SimilarityMetricFactory.GetSimiliarityMetric(config.SimilarityMetric));
    // Iterates over all the source artifacts to determine the probabilities to target artifacts - by executing a search
    foreach (TLArtifact sourceArtifact in sourceArtifacts.Values)
    {
        String query = sourceArtifact.Text;
        // Executes the query against the dictionary with the prepared ANC weights
        List<Result> results;
        results = searcher.search(query, dict, PrepareANCData(ancTermsWeights));
        // Iterates over the results and stores them in the matrix
        foreach (Result r in results)
        {
            string targetArtifactId = r.ArtifactId;
            similarityMatrix.AddLink(sourceArtifact.Id, targetArtifactId, r.Ranking);
        }
    }
    return similarityMatrix;
}
/// <summary>
/// Computes cosine similarities between documents via the Vector Space Model.
/// </summary>
/// <param name="source">Source artifacts</param>
/// <param name="target">Target artifacts</param>
/// <param name="weight">Weighting scheme</param>
/// <returns>Similarity matrix</returns>
/// <exception cref="NotImplementedException">Thrown for an unrecognized weighting scheme</exception>
public static TLSimilarityMatrix Compute(TLArtifactsCollection source, TLArtifactsCollection target, VSMWeightEnum weight)
{
    if (weight == VSMWeightEnum.TFIDF)
    {
        // TF-IDF over the combined corpus
        return SimilarityUtil.ComputeCosine(WeightUtil.ComputeTFIDF(new TermDocumentMatrix(source, target)), source.Keys, target.Keys);
    }
    if (weight == VSMWeightEnum.BooleanQueriesAndTFIDFCorpus)
    {
        // boolean term weights for the queries, TF-IDF for the corpus
        return SimilarityUtil.ComputeCosine(WeightUtil.ComputeBinaryTF(new TermDocumentMatrix(source)), WeightUtil.ComputeTFIDF(new TermDocumentMatrix(target)));
    }
    if (weight == VSMWeightEnum.NoWeight)
    {
        // raw term counts, no weighting
        return SimilarityUtil.ComputeCosine(new TermDocumentMatrix(source, target), source.Keys, target.Keys);
    }
    throw new NotImplementedException("Unknown weighting scheme \"" + weight + "\"");
}
/// <summary>
/// Verifies the component tolerates an empty artifacts list when stopwords
/// are present.
/// </summary>
public void EmptyArtifactListTest()
{
    Workspace.Store("listOfArtifacts", new TLArtifactsCollection());
    TLStopwords stopwords = new TLStopwords();
    foreach (string word in new string[] { "one", "word", "to", "add" })
    {
        stopwords.Add(word);
    }
    Workspace.Store("stopwords", stopwords);
    TestComponent.Compute();
}
/// <summary>
/// Verifies the component tolerates an empty stopwords list when artifacts
/// are present.
/// </summary>
public void EmptyStopWordsListTest()
{
    TLArtifactsCollection artifacts = new TLArtifactsCollection();
    artifacts.Add(new TLArtifact("id", "text 1"));
    artifacts.Add(new TLArtifact("id1", "text two"));
    artifacts.Add(new TLArtifact("id2", "text is three"));
    artifacts.Add(new TLArtifact("id3", "text has a the stop word"));
    Workspace.Store("listOfArtifacts", artifacts);
    Workspace.Store("stopwords", new TLStopwords());
    TestComponent.Compute();
}
/// <summary>
/// Constructs a term-by-document matrix from a TLArtifactsCollection.
/// Rows are documents (in collection order), columns are terms (in first-seen
/// order); cell [i][j] holds the count of term j in document i.
/// </summary>
/// <param name="artifacts">Artifacts collection</param>
public TermDocumentMatrix(TLArtifactsCollection artifacts)
{
    _termIndex = new List<string>();
    _docIndex = new List<string>();
    _termIndexLookup = new Dictionary<string, int>();
    _docIndexLookup = new Dictionary<string, int>();
    // create temporary corpus to build matrix with
    // (maps document id -> per-term counts)
    Dictionary<string, Dictionary<string, double>> corpus = new Dictionary<string, Dictionary<string, double>>();
    foreach (TLArtifact artifact in artifacts.Values)
    {
        // update document maps
        _docIndex.Add(artifact.Id);
        _docIndexLookup.Add(artifact.Id, _docIndex.Count - 1);
        corpus.Add(artifact.Id, new Dictionary<string, double>());
        // whitespace-split the text; blank tokens are skipped
        foreach (string term in artifact.Text.Split())
        {
            if (!String.IsNullOrWhiteSpace(term))
            {
                // update term maps (first sighting assigns the column index)
                if (!_termIndexLookup.ContainsKey(term))
                {
                    _termIndex.Add(term);
                    _termIndexLookup.Add(term, _termIndex.Count - 1);
                }
                // update document counts
                if (corpus[artifact.Id].ContainsKey(term))
                {
                    corpus[artifact.Id][term]++;
                }
                else
                {
                    corpus[artifact.Id].Add(term, 1);
                }
            }
        }
    }
    // build term-by-document matrix; TryGetValue leaves 0.0 for terms a
    // document does not contain
    _matrix = new double[_docIndex.Count][];
    for (int i = 0; i < _docIndex.Count; i++)
    {
        _matrix[i] = new double[_termIndex.Count];
        for (int j = 0; j < _termIndex.Count; j++)
        {
            corpus[_docIndex[i]].TryGetValue(_termIndex[j], out _matrix[i][j]);
        }
    }
}
/// <summary>
/// Imports a SEMERU corpus in the form (each line):
/// ID TEXT TEXT TEXT TEXT TEXT ...
/// </summary>
/// <param name="filename">Corpus file location</param>
/// <returns>Artifacts collection</returns>
public static TLArtifactsCollection Import(string filename)
{
    TLArtifactsCollection answer = new TLArtifactsCollection();
    // BUG FIX: the reader is now disposed via a using block; previously the
    // file handle was never closed.
    using (StreamReader file = new StreamReader(filename))
    {
        String line;
        while ((line = file.ReadLine()) != null)
        {
            // first whitespace-separated token is the id; the rest is the text
            List<string> artifacts = new List<string>(line.Split());
            String id = artifacts[0].Trim();
            artifacts.RemoveAt(0);
            String doc = String.Join(" ", artifacts);
            answer.Add(new TLArtifact(id, doc));
        }
    }
    return answer;
}
/// <summary>
/// Extracts the configured part of speech from every stored artifact,
/// logging progress per artifact (tagging can be slow, especially with
/// bidirectional models), and stores the extracted collection back.
/// </summary>
public override void Compute()
{
    Logger.Trace("Starting POSExtractor. This may take awhile (especially the bidirectional models)....");
    TLArtifactsCollection artifacts = (TLArtifactsCollection)Workspace.Load("ListOfArtifacts");
    TLArtifactsCollection extracted = new TLArtifactsCollection();
    int count = 1;
    // Tag artifact-by-artifact (rather than via POSTagger.Extract) so progress
    // can be reported after each one.
    foreach (KeyValuePair<string, TLArtifact> entry in artifacts)
    {
        extracted.Add(entry.Key, POSTagger.ExtractArtifact(entry.Value, _config.POS, _config.ModelFile));
        Logger.Trace("Extracted " + count + "/" + artifacts.Count);
        count++;
    }
    Workspace.Store("ListOfArtifacts", extracted);
}
/// <summary>
/// Removes the given stopwords from every artifact's text in place.
/// </summary>
/// <param name="listOfArtifacts">Artifacts to process (mutated in place)</param>
/// <param name="stopwords">Stopwords to remove</param>
/// <exception cref="ComponentException">Thrown when either argument is null</exception>
private static void ProcessArtifacts(TLArtifactsCollection listOfArtifacts, TLStopwords stopwords)
{
    // FIX: corrected "Recieved" typos in both messages for consistency with
    // the other null-check messages in this codebase ("Received null ...").
    if (listOfArtifacts == null)
    {
        throw new ComponentException("Received null listofArtifacts");
    }
    if (stopwords == null)
    {
        throw new ComponentException("Received null stopwords");
    }
    foreach (TLArtifact artifact in listOfArtifacts.Values)
    {
        artifact.Text = PreprocessorStopWords.Process(artifact.Text, stopwords);
    }
}
/// <summary>
/// Writes the artifacts element: one &lt;artifact&gt; child per entry with
/// trimmed id and content, and an always-empty parent_id.
/// </summary>
/// <param name="artifactsCollection">Artifacts to serialize</param>
/// <param name="writer">XML writer positioned inside the parent element</param>
private static void WriteArtifacts(TLArtifactsCollection artifactsCollection, System.Xml.XmlWriter writer)
{
    writer.WriteStartElement("artifacts");
    foreach (KeyValuePair<string, TLArtifact> entry in artifactsCollection)
    {
        writer.WriteStartElement("artifact");
        writer.WriteElementString("id", entry.Value.Id.Trim());
        writer.WriteElementString("content", entry.Value.Text.Trim());
        writer.WriteElementString("parent_id", String.Empty);
        writer.WriteEndElement();
    }
    writer.WriteEndElement(); // artifacts
}
/// <summary>
/// Verifies StopwordsRemover removes stopwords and enforces the minimum
/// word length (4) when called directly.
/// </summary>
public void CleanArtifactsWithStopwords()
{
    TLArtifactsCollection artifacts = new TLArtifactsCollection();
    artifacts.Add(new TLArtifact("id1", "clean these words"));
    artifacts.Add(new TLArtifact("id2", "this has a stopword"));
    artifacts.Add(new TLArtifact("id3", "an expression"));
    TLStopwords stopwords = new TLStopwords();
    stopwords.Add("these");
    stopwords.Add("this");
    TLArtifactsCollection processedArtifacts = StopwordsRemover.ProcessArtifacts(artifacts, stopwords, 4, false);
    // BUG FIX: Assert.AreEqual takes (expected, actual); the arguments were
    // reversed, which produces misleading failure messages.
    Assert.AreEqual("clean words", processedArtifacts["id1"].Text);
    Assert.AreEqual("stopword", processedArtifacts["id2"].Text);
    Assert.AreEqual("expression", processedArtifacts["id3"].Text);
}