Example #1
0
        /// <summary>
        /// Determines whether the specified object is a TLArtifact whose Id and Text
        /// both equal those of the current instance.
        /// </summary>
        /// <param name="obj">The object to compare with; may be null or of another type</param>
        /// <returns>
        /// true if <paramref name="obj"/> is a TLArtifact with an equal Id and an equal Text;
        /// false if it is null, of a different type, or differs in either member
        /// </returns>
        public override bool Equals(object obj)
        {
            TLArtifact other = obj as TLArtifact;

            // ReferenceEquals avoids going through any user-defined != operator.
            if (object.ReferenceEquals(other, null))
            {
                return false;
            }

            // The static object.Equals handles null on either side, so an artifact
            // with a null Id or Text compares cleanly instead of throwing
            // NullReferenceException.
            return object.Equals(Id, other.Id) && object.Equals(Text, other.Text);
        }
Example #2
0
 /// <summary>
 /// Builds a new artifacts collection in which the text of every artifact has been
 /// run through ProcessText to split CamelCase terms.
 /// </summary>
 /// <param name="listOfArtifacts">Artifacts collection to process</param>
 /// <param name="convertToLowercase">Forwarded to ProcessText; option to convert resulting terms to lowercase</param>
 /// <returns>New collection containing the processed artifacts</returns>
 public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, bool convertToLowercase)
 {
     TLArtifactsCollection result = new TLArtifactsCollection();
     foreach (TLArtifact source in listOfArtifacts.Values)
     {
         // Create the copy with the source Id and empty text, then assign the processed text.
         TLArtifact copy = new TLArtifact(source.Id, String.Empty)
         {
             Text = ProcessText(source.Text, convertToLowercase)
         };
         result.Add(copy);
     }
     return result;
 }
Example #3
0
 /// <summary>
 /// Builds a new artifacts collection in which the text of every artifact has been
 /// stemmed with the Snowball stemming algorithm via ProcessText.
 /// </summary>
 /// <param name="artifacts">Artifacts collection to process</param>
 /// <param name="langauge">Stemmer language, forwarded to ProcessText</param>
 /// <returns>New collection containing the stemmed artifacts</returns>
 public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection artifacts, SnowballStemmerEnum langauge)
 {
     TLArtifactsCollection result = new TLArtifactsCollection();
     foreach (TLArtifact source in artifacts.Values)
     {
         // Create the copy with the source Id and empty text, then assign the stemmed text.
         TLArtifact stemmed = new TLArtifact(source.Id, String.Empty)
         {
             Text = ProcessText(source.Text, langauge)
         };
         result.Add(stemmed);
     }
     return result;
 }
Example #4
0
 /// <summary>
 /// Builds a new TLArtifactsCollection in which the text of every artifact has been
 /// filtered by ProcessText using the given stopwords and options.
 /// </summary>
 /// <param name="listOfArtifacts">Artifacts collection to process</param>
 /// <param name="stopwords">Stopwords collection, forwarded to ProcessText</param>
 /// <param name="minWordLength">Minimum word length, forwarded to ProcessText</param>
 /// <param name="removeNumbers">Flag to remove numbers, forwarded to ProcessText</param>
 /// <returns>New collection containing the processed artifacts</returns>
 public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, TLStopwords stopwords, int minWordLength, bool removeNumbers)
 {
     TLArtifactsCollection result = new TLArtifactsCollection();
     foreach (TLArtifact source in listOfArtifacts.Values)
     {
         // Create the copy with the source Id and empty text, then assign the filtered text.
         TLArtifact filtered = new TLArtifact(source.Id, String.Empty)
         {
             Text = ProcessText(source.Text, stopwords, minWordLength, removeNumbers)
         };
         result.Add(filtered);
     }
     return result;
 }
Example #5
0
 /// <summary>
 /// Builds a new artifacts collection in which the identifiers in every artifact's
 /// text have been split by ProcessText.
 /// </summary>
 /// <param name="listOfArtifacts">Artifacts collection to process</param>
 /// <param name="keepCompoundIdentifier">
 /// Forwarded to ProcessText; presumably controls whether the original compound
 /// identifier is kept alongside its parts (confirm against ProcessText)
 /// </param>
 /// <returns>New collection containing the split artifacts</returns>
 public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, bool keepCompoundIdentifier)
 {
     TLArtifactsCollection result = new TLArtifactsCollection();
     foreach (TLArtifact source in listOfArtifacts.Values)
     {
         // Create the copy with the source Id and empty text, then assign the split text.
         TLArtifact split = new TLArtifact(source.Id, String.Empty)
         {
             Text = ProcessText(source.Text, keepCompoundIdentifier)
         };
         result.Add(split);
     }
     return result;
 }
        /// <summary>
        /// Reads artifacts from the given XML file.
        /// </summary>
        /// <param name="filepath">Path of the XML file to load</param>
        /// <param name="trimValues">When true, the id, title and content values are trimmed before use</param>
        /// <returns>
        /// Collection keyed by artifact id; each artifact's text is its title and content
        /// joined by a single space. Repeated ids are logged as warnings and skipped.
        /// </returns>
        public static TLArtifactsCollection ReadXMLFile(string filepath, bool trimValues)
        {
            TLArtifactsCollection result = new TLArtifactsCollection();

            XPathDocument document = new XPathDocument(filepath);
            XPathNavigator root = document.CreateNavigator();
            XPathNodeIterator artifactNodes = root.Select("/artifacts/artifact");

            while (artifactNodes.MoveNext())
            {
                // Only these three child tags are read: art_id, art_title, art_content
                string id = ReadSingleItem(filepath, artifactNodes.Current, "art_id");
                string title = ReadSingleItem(filepath, artifactNodes.Current, "art_title");
                string body = ReadSingleItem(filepath, artifactNodes.Current, "art_content");

                if (trimValues)
                {
                    id = id.Trim();
                    title = title.Trim();
                    body = body.Trim();
                }

                string text = title + " " + body;

                // The first occurrence of an id wins; later repeats are reported and dropped.
                if (artifacts_ContainsId(result, id))
                {
                    PoirotFormatArtifactsReader.Logger.Warn(
                        String.Format("Repeated artifact ID '{0}' found in file '{1}'.", id, filepath)
                        );
                    continue;
                }

                result.Add(id, new TLArtifact(id, text));
            }

            return result;
        }

        /// <summary>
        /// Tells whether the collection already holds an artifact under the given id.
        /// </summary>
        private static bool artifacts_ContainsId(TLArtifactsCollection collection, string id)
        {
            return collection.ContainsKey(id);
        }
Example #7
0
 /// <summary>
 /// Imports artifacts from an XML file in standard CoEST format.
 /// </summary>
 /// <param name="filepath">Input file path</param>
 /// <param name="trimValues">When true, collection info, artifact ids and contents are trimmed of surrounding whitespace</param>
 /// <returns>
 /// Artifacts collection with its collection info filled in. Artifacts whose id repeats
 /// an earlier one are skipped without any warning.
 /// </returns>
 public static TLArtifactsCollection ImportXMLFile(string filepath, bool trimValues)
 {
     TLArtifactsCollection result = new TLArtifactsCollection();
     XPathDocument document = new XPathDocument(filepath);
     XPathNavigator root = document.CreateNavigator();

     // Collection-level metadata.
     result.CollectionId = ReadSingleXMLNode(filepath, root, "/artifacts_collection/collection_info/id");
     result.CollectionName = ReadSingleXMLNode(filepath, root, "/artifacts_collection/collection_info/name");
     result.CollectionVersion = ReadSingleXMLNode(filepath, root, "/artifacts_collection/collection_info/version");
     result.CollectionDescription = ReadSingleXMLNode(filepath, root, "/artifacts_collection/collection_info/description");
     if (trimValues)
     {
         result.CollectionId = result.CollectionId.Trim();
         result.CollectionName = result.CollectionName.Trim();
         result.CollectionVersion = result.CollectionVersion.Trim();
         result.CollectionDescription = result.CollectionDescription.Trim();
     }

     // Content location is optional; when the node is absent it defaults to "internal".
     XPathNavigator locationNode = root.SelectSingleNode("/artifacts_collection/collection_info/content_location");
     string locationType = (locationNode != null) ? locationNode.Value : "internal";
     bool contentIsExternal = locationType.Equals("external");

     // External content paths are resolved against the directory of the input file.
     string rootDir = System.IO.Path.GetDirectoryName(filepath);

     XPathNodeIterator artifactNodes = root.Select("/artifacts_collection/artifacts/artifact");
     while (artifactNodes.MoveNext())
     {
         string id = artifactNodes.Current.SelectSingleNode("id").InnerXml;
         XPathNavigator contentNode = artifactNodes.Current.SelectSingleNode("content");

         // External: the content node holds a file path whose text is loaded from disk.
         // Otherwise the node's inner XML is the content itself.
         string body = contentIsExternal
             ? System.IO.File.ReadAllText(System.IO.Path.Combine(rootDir, contentNode.InnerXml.Trim()))
             : contentNode.InnerXml;

         if (trimValues)
         {
             id = id.Trim();
             body = body.Trim();
         }

         // The first occurrence of an id wins. No warning is logged for repeats
         // (the logging call that used to be here is disabled).
         if (result.ContainsKey(id))
         {
             continue;
         }

         result.Add(id, new TLArtifact(id, body));
     }
     return result;
 }
 /// <summary>
 /// Produces a new TLArtifact holding only the terms of the given artifact that carry
 /// the specified part of speech.
 /// </summary>
 /// <param name="artifact">Single artifact whose text is tagged</param>
 /// <param name="pos">Part of speech to extract</param>
 /// <param name="modelFile">Training model file location, forwarded to Tag</param>
 /// <returns>New artifact with the same Id and the text returned by ExtractPOS</returns>
 public static TLArtifact ExtractArtifact(TLArtifact artifact, POSTaggerSpeechType pos, string modelFile)
 {
     // Read the Id first so members are evaluated in the same order as a single nested call.
     var id = artifact.Id;
     var tagged = Tag(artifact.Text, modelFile);
     var extracted = ExtractPOS(tagged, pos);
     return new TLArtifact(id, extracted);
 }