/// <summary>
/// Builds a new artifacts collection whose texts have been stemmed with the Porter stemming algorithm.
/// </summary>
/// <remarks>
/// The input collection is only read: every artifact is re-created with the same Id and with
/// its Text replaced by the result of <c>ProcessText(artifact.Text)</c>.
/// </remarks>
/// <param name="listOfArtifacts">Artifacts collection to process; must not be null.</param>
/// <returns>A new collection containing the stemmed artifacts.</returns>
public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts)
{
    TLArtifactsCollection stemmedCollection = new TLArtifactsCollection();

    foreach (TLArtifact source in listOfArtifacts.Values)
    {
        // The artifact is constructed with an empty text first; the stemmed
        // text is then assigned through the Text member.
        TLArtifact stemmed = new TLArtifact(source.Id, String.Empty)
        {
            Text = ProcessText(source.Text)
        };

        stemmedCollection.Add(stemmed);
    }

    return stemmedCollection;
}
/// <summary>
/// Applies simple stopword removal to the text of every artifact in a collection.
/// </summary>
/// <remarks>
/// A new collection is returned; each artifact in it keeps the Id of its source artifact and
/// carries the text produced by <c>ProcessText(text, minWordLength, removeNumbers)</c>.
/// </remarks>
/// <param name="listOfArtifacts">Artifacts collection to process; must not be null.</param>
/// <param name="minWordLength">Minimum word length, forwarded unchanged to <c>ProcessText</c>.</param>
/// <param name="removeNumbers">Flag to remove numbers, forwarded unchanged to <c>ProcessText</c>.</param>
/// <returns>A new collection containing the processed artifacts.</returns>
public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, int minWordLength, bool removeNumbers)
{
    TLArtifactsCollection cleanedCollection = new TLArtifactsCollection();

    foreach (TLArtifact source in listOfArtifacts.Values)
    {
        // Construct with an empty text, then assign the filtered text.
        TLArtifact cleaned = new TLArtifact(source.Id, String.Empty)
        {
            Text = ProcessText(source.Text, minWordLength, removeNumbers)
        };

        cleanedCollection.Add(cleaned);
    }

    return cleanedCollection;
}
/// <summary>
/// Splits CamelCase terms in the text of every artifact in a collection.
/// </summary>
/// <remarks>
/// A new collection is returned; each artifact in it keeps the Id of its source artifact and
/// carries the text produced by <c>ProcessText(text, convertToLowercase)</c>.
/// </remarks>
/// <param name="listOfArtifacts">Artifacts collection to process; must not be null.</param>
/// <param name="convertToLowercase">Option to convert the resulting terms to lowercase, forwarded unchanged to <c>ProcessText</c>.</param>
/// <returns>A new collection containing the processed artifacts.</returns>
public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, bool convertToLowercase)
{
    TLArtifactsCollection splitCollection = new TLArtifactsCollection();

    foreach (TLArtifact source in listOfArtifacts.Values)
    {
        // Construct with an empty text, then assign the split text.
        TLArtifact split = new TLArtifact(source.Id, String.Empty)
        {
            Text = ProcessText(source.Text, convertToLowercase)
        };

        splitCollection.Add(split);
    }

    return splitCollection;
}
/// <summary>
/// Stems the text of every artifact in a collection using the Snowball stemming algorithm.
/// </summary>
/// <remarks>
/// A new collection is returned; each artifact in it keeps the Id of its source artifact and
/// carries the text produced by <c>ProcessText(text, langauge)</c>.
/// </remarks>
/// <param name="artifacts">Artifacts collection to process; must not be null.</param>
/// <param name="langauge">Stemmer language, forwarded unchanged to <c>ProcessText</c>.</param>
/// <returns>A new collection containing the stemmed artifacts.</returns>
public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection artifacts, SnowballStemmerEnum langauge)
{
    TLArtifactsCollection stemmedCollection = new TLArtifactsCollection();

    foreach (TLArtifact source in artifacts.Values)
    {
        // Construct with an empty text, then assign the stemmed text.
        TLArtifact stemmed = new TLArtifact(source.Id, String.Empty)
        {
            Text = ProcessText(source.Text, langauge)
        };

        stemmedCollection.Add(stemmed);
    }

    return stemmedCollection;
}
/// <summary>
/// Splits identifiers in the text of every artifact in a collection.
/// </summary>
/// <remarks>
/// A new collection is returned; each artifact in it keeps the Id of its source artifact and
/// carries the text produced by <c>ProcessText(text, keepCompoundIdentifier)</c>.
/// </remarks>
/// <param name="listOfArtifacts">Artifacts collection to process; must not be null.</param>
/// <param name="keepCompoundIdentifier">
/// Forwarded unchanged to <c>ProcessText</c>. Presumably controls whether the original compound
/// identifier is kept alongside its parts - confirm against <c>ProcessText</c>.
/// </param>
/// <returns>A new collection containing the split artifacts.</returns>
public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, bool keepCompoundIdentifier)
{
    TLArtifactsCollection splitCollection = new TLArtifactsCollection();

    foreach (TLArtifact source in listOfArtifacts.Values)
    {
        // Construct with an empty text, then assign the split text.
        TLArtifact split = new TLArtifact(source.Id, String.Empty)
        {
            Text = ProcessText(source.Text, keepCompoundIdentifier)
        };

        splitCollection.Add(split);
    }

    return splitCollection;
}
/// <summary>
/// Reads artifacts from the given XML file.
/// </summary>
/// <remarks>
/// Every node matching <c>/artifacts/artifact</c> contributes one artifact. Only the
/// <c>art_id</c>, <c>art_title</c> and <c>art_content</c> items are read; the artifact text is
/// the title and the content joined by a single space. An artifact whose ID was already read
/// is not added again; a warning is logged instead.
/// </remarks>
/// <param name="filepath">Path of the XML file to read.</param>
/// <param name="trimValues">When true, the id, title and content are trimmed before use.</param>
/// <returns>The collection of artifacts read from the file, keyed by artifact ID.</returns>
public static TLArtifactsCollection ReadXMLFile(string filepath, bool trimValues)
{
    TLArtifactsCollection artifacts = new TLArtifactsCollection();

    XPathNavigator navigator = new XPathDocument(filepath).CreateNavigator();
    XPathNodeIterator artifactNodes = navigator.Select("/artifacts/artifact");

    while (artifactNodes.MoveNext())
    {
        // Only these three items of each artifact node are read.
        string id = ReadSingleItem(filepath, artifactNodes.Current, "art_id");
        string title = ReadSingleItem(filepath, artifactNodes.Current, "art_title");
        string body = ReadSingleItem(filepath, artifactNodes.Current, "art_content");

        if (trimValues)
        {
            id = id.Trim();
            title = title.Trim();
            body = body.Trim();
        }

        string text = title + " " + body;

        // The first occurrence of an ID wins; later duplicates are only reported.
        if (artifacts.ContainsKey(id))
        {
            PoirotFormatArtifactsReader.Logger.Warn(
                String.Format("Repeated artifact ID '{0}' found in file '{1}'.", id, filepath)
            );
            continue;
        }

        artifacts.Add(id, new TLArtifact(id, text));
    }

    return artifacts;
}
/// <summary>
/// Imports an artifacts collection from an XML file rooted at <c>/artifacts_collection</c>.
/// </summary>
/// <remarks>
/// The collection id, name, version and description are read from <c>collection_info</c>.
/// If the optional <c>collection_info/content_location</c> node has the value "external", the
/// <c>content</c> node of each artifact is treated as a file path relative to the directory of
/// <paramref name="filepath"/> and the artifact text is loaded from that file; otherwise the
/// inner XML of the <c>content</c> node is used as the text. An artifact whose ID was already
/// read is not added again; a warning is logged instead.
/// </remarks>
/// <param name="filepath">Path of the XML file to import.</param>
/// <param name="trimValues">
/// When true, the collection info values, the artifact IDs and the artifact contents are trimmed.
/// </param>
/// <returns>The imported artifacts collection, keyed by artifact ID.</returns>
/// <exception cref="System.Xml.XmlException">
/// An <c>artifact</c> element has no <c>id</c> or no <c>content</c> child node.
/// </exception>
public static TLArtifactsCollection ImportArtifacts(string filepath, bool trimValues)
{
    TLArtifactsCollection artifacts = new TLArtifactsCollection();

    XPathDocument doc = new XPathDocument(filepath);
    XPathNavigator nav = doc.CreateNavigator();

    // Read collection info.
    artifacts.CollectionId = ReadSingleNode(filepath, nav, "/artifacts_collection/collection_info/id");
    artifacts.CollectionName = ReadSingleNode(filepath, nav, "/artifacts_collection/collection_info/name");
    artifacts.CollectionVersion = ReadSingleNode(filepath, nav, "/artifacts_collection/collection_info/version");
    artifacts.CollectionDescription = ReadSingleNode(filepath, nav, "/artifacts_collection/collection_info/description");

    if (trimValues)
    {
        artifacts.CollectionId = artifacts.CollectionId.Trim();
        artifacts.CollectionName = artifacts.CollectionName.Trim();
        artifacts.CollectionVersion = artifacts.CollectionVersion.Trim();
        artifacts.CollectionDescription = artifacts.CollectionDescription.Trim();
    }

    // Check what type of content location the file has; the default is internal.
    string content_location_type = "internal";
    XPathNavigator locationNode = nav.SelectSingleNode("/artifacts_collection/collection_info/content_location");

    // If a content location has been specified, read it.
    if (locationNode != null)
    {
        content_location_type = locationNode.Value;
    }

    // The root directory is needed for external content, to resolve the paths of the content files.
    string rootDir = System.IO.Path.GetDirectoryName(filepath);

    XPathNodeIterator artifactsIterator = nav.Select("/artifacts_collection/artifacts/artifact");

    while (artifactsIterator.MoveNext())
    {
        // SelectSingleNode returns null when the child is missing; fail with a message that
        // names the file instead of letting a NullReferenceException escape.
        XPathNavigator idNode = artifactsIterator.Current.SelectSingleNode("id");
        if (idNode == null)
        {
            throw new System.Xml.XmlException(
                String.Format("An artifact without an 'id' node was found in file '{0}'.", filepath));
        }

        string artifactId = idNode.InnerXml;

        XPathNavigator contentNode = artifactsIterator.Current.SelectSingleNode("content");
        if (contentNode == null)
        {
            throw new System.Xml.XmlException(
                String.Format("Artifact '{0}' has no 'content' node in file '{1}'.", artifactId, filepath));
        }

        string content;
        if (content_location_type.Equals("external"))
        {
            // External content: the node holds a path relative to the directory of the XML file.
            content = System.IO.File.ReadAllText(System.IO.Path.Combine(rootDir, contentNode.InnerXml.Trim()));
        }
        else
        {
            content = contentNode.InnerXml;
        }

        if (trimValues)
        {
            artifactId = artifactId.Trim();
            content = content.Trim();
        }

        // Checking if the ID is already in the artifacts list; the first occurrence wins.
        if (!artifacts.ContainsKey(artifactId))
        {
            TLArtifact artifact = new TLArtifact(artifactId, content);
            artifacts.Add(artifactId, artifact);
        }
        else
        {
            CoestDatasetImporterHelper.Logger.Warn(
                String.Format("Repeated artifact ID '{0}' found in file '{1}'.", artifactId, filepath)
            );
        }
    }

    return artifacts;
}
/// <summary>
/// Extracts all terms with the specified part of speech from a single artifact.
/// </summary>
/// <remarks>
/// The artifact text is first passed to <c>Tag</c> together with the model file, and the
/// tagged result is then filtered by <c>ExtractPOS</c>. The input artifact is only read.
/// </remarks>
/// <param name="artifact">Single artifact to process; must not be null.</param>
/// <param name="pos">Part of speech to extract.</param>
/// <param name="modelFile">Training model file location, forwarded to <c>Tag</c>.</param>
/// <returns>A new artifact with the same Id whose text is the result of the extraction.</returns>
public static TLArtifact ExtractArtifact(TLArtifact artifact, POSTaggerSpeechType pos, string modelFile)
{
    // Read the Id first so the member accesses happen in the same order as in a single
    // nested constructor call.
    var id = artifact.Id;

    var taggedText = Tag(artifact.Text, modelFile);
    var extractedTerms = ExtractPOS(taggedText, pos);

    return new TLArtifact(id, extractedTerms);
}