예제 #1
0
 /// <summary>
 /// Stems the terms of every artifact in a TLArtifactsCollection (Porter stemming algorithm).
 /// The per-artifact work is delegated to the single-argument <c>ProcessText</c> overload;
 /// this method only reads the Id and Text of each input artifact and builds a new collection.
 /// </summary>
 /// <param name="listOfArtifacts">Artifacts collection to process</param>
 /// <returns>New collection with one stemmed artifact per input artifact, carrying the same Id</returns>
 public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts)
 {
     TLArtifactsCollection stemmed = new TLArtifactsCollection();
     foreach (TLArtifact source in listOfArtifacts.Values)
     {
         // Create an empty-text artifact with the same Id, then attach the processed text.
         TLArtifact copy = new TLArtifact(source.Id, String.Empty)
         {
             Text = ProcessText(source.Text)
         };
         stemmed.Add(copy);
     }
     return stemmed;
 }
예제 #2
0
        /// <summary>
        /// Applies simple stopwords removal to each artifact in an artifacts collection.
        /// The text of every artifact is handed to <c>ProcessText</c> together with the two
        /// options below, and the result is stored in a new artifact with the same Id.
        /// </summary>
        /// <param name="listOfArtifacts">Artifacts collection to process</param>
        /// <param name="minWordLength">Minimum word length, forwarded unchanged to <c>ProcessText</c></param>
        /// <param name="removeNumbers">Flag to remove numbers, forwarded unchanged to <c>ProcessText</c></param>
        /// <returns>New collection containing the processed artifacts</returns>
        public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, int minWordLength, bool removeNumbers)
        {
            TLArtifactsCollection cleaned = new TLArtifactsCollection();

            foreach (TLArtifact source in listOfArtifacts.Values)
            {
                // Create an empty-text artifact with the same Id, then attach the processed text.
                TLArtifact copy = new TLArtifact(source.Id, String.Empty)
                {
                    Text = ProcessText(source.Text, minWordLength, removeNumbers)
                };
                cleaned.Add(copy);
            }

            return cleaned;
        }
예제 #3
0
        /// <summary>
        /// Splits CamelCase terms in every artifact of an artifacts collection.
        /// Each artifact's text is run through <c>ProcessText</c> and the result is placed
        /// in a new artifact that keeps the original Id.
        /// </summary>
        /// <param name="listOfArtifacts">Artifacts collection to process</param>
        /// <param name="convertToLowercase">Option to convert resulting terms to lowercase (forwarded to <c>ProcessText</c>)</param>
        /// <returns>New collection containing the processed artifacts</returns>
        public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, bool convertToLowercase)
        {
            TLArtifactsCollection split = new TLArtifactsCollection();

            foreach (TLArtifact source in listOfArtifacts.Values)
            {
                // Create an empty-text artifact with the same Id, then attach the processed text.
                TLArtifact copy = new TLArtifact(source.Id, String.Empty)
                {
                    Text = ProcessText(source.Text, convertToLowercase)
                };
                split.Add(copy);
            }

            return split;
        }
예제 #4
0
        /// <summary>
        /// Stems every artifact in an artifacts collection using the Snowball stemming algorithm.
        /// The text of each artifact is passed to <c>ProcessText</c> along with the chosen
        /// stemmer language, and the result is stored in a new artifact with the same Id.
        /// </summary>
        /// <param name="artifacts">Artifacts collection to process</param>
        /// <param name="langauge">Stemmer language (the parameter name is misspelled but kept for caller compatibility)</param>
        /// <returns>New collection containing the stemmed artifacts</returns>
        public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection artifacts, SnowballStemmerEnum langauge)
        {
            TLArtifactsCollection stemmed = new TLArtifactsCollection();

            foreach (TLArtifact source in artifacts.Values)
            {
                // Create an empty-text artifact with the same Id, then attach the stemmed text.
                TLArtifact copy = new TLArtifact(source.Id, String.Empty)
                {
                    Text = ProcessText(source.Text, langauge)
                };
                stemmed.Add(copy);
            }

            return stemmed;
        }
예제 #5
0
        /// <summary>
        /// Splits identifiers for each artifact in an artifacts collection.
        /// Each artifact's text is run through <c>ProcessText</c> and the result is placed
        /// in a new artifact that keeps the original Id.
        /// </summary>
        /// <param name="listOfArtifacts">Artifacts collection to process</param>
        /// <param name="keepCompoundIdentifier">
        /// Forwarded unchanged to <c>ProcessText</c>; presumably controls whether the original
        /// compound identifier is kept alongside its split parts (confirm against <c>ProcessText</c>).
        /// </param>
        /// <returns>New collection containing the split artifacts</returns>
        public static TLArtifactsCollection ProcessArtifacts(TLArtifactsCollection listOfArtifacts, bool keepCompoundIdentifier)
        {
            TLArtifactsCollection split = new TLArtifactsCollection();

            foreach (TLArtifact source in listOfArtifacts.Values)
            {
                // Create an empty-text artifact with the same Id, then attach the processed text.
                TLArtifact copy = new TLArtifact(source.Id, String.Empty)
                {
                    Text = ProcessText(source.Text, keepCompoundIdentifier)
                };
                split.Add(copy);
            }

            return split;
        }
예제 #6
0
        /// <summary>
        /// Reads artifacts from the given XML file.
        /// Every node matching <c>/artifacts/artifact</c> contributes one artifact whose text is
        /// the <c>art_title</c> value, a single space, and the <c>art_content</c> value.
        /// When an ID occurs more than once, the first occurrence is kept and a warning is logged
        /// for each later one.
        /// </summary>
        /// <param name="filepath">Path of the XML file to load</param>
        /// <param name="trimValues">When true, id, title and content are trimmed before being used</param>
        /// <returns>Collection of the artifacts read, keyed by artifact ID</returns>
        public static TLArtifactsCollection ReadXMLFile(string filepath, bool trimValues)
        {
            TLArtifactsCollection result = new TLArtifactsCollection();

            XPathDocument document = new XPathDocument(filepath);
            XPathNavigator root = document.CreateNavigator();
            XPathNodeIterator artifactNodes = root.Select("/artifacts/artifact");

            while (artifactNodes.MoveNext())
            {
                // Only these three child tags are read: art_id, art_title, art_content.
                string id = ReadSingleItem(filepath, artifactNodes.Current, "art_id");
                string title = ReadSingleItem(filepath, artifactNodes.Current, "art_title");
                string body = ReadSingleItem(filepath, artifactNodes.Current, "art_content");

                if (trimValues)
                {
                    id = id.Trim();
                    title = title.Trim();
                    body = body.Trim();
                }

                // The artifact text is the title and the content separated by one space.
                string text = title + " " + body;

                // A repeated ID is reported and skipped; the first artifact with that ID wins.
                if (artifacts_ContainsId(result, id))
                {
                    PoirotFormatArtifactsReader.Logger.Warn(
                        String.Format("Repeated artifact ID '{0}' found in file '{1}'.", id, filepath)
                        );
                    continue;
                }

                result.Add(id, new TLArtifact(id, text));
            }

            return result;
        }

        /// <summary>
        /// Tells whether the collection already holds an artifact under the given ID.
        /// </summary>
        private static bool artifacts_ContainsId(TLArtifactsCollection collection, string id)
        {
            return collection.ContainsKey(id);
        }
예제 #7
0
        /// <summary>
        /// Imports an artifacts collection from an XML file rooted at <c>artifacts_collection</c>.
        /// The collection id, name, version and description are read from <c>collection_info</c>.
        /// Each <c>artifacts/artifact</c> node then contributes one artifact built from its
        /// <c>id</c> and <c>content</c> children.
        /// If <c>collection_info/content_location</c> is present and equals "external", the
        /// content element is treated as a file path relative to the directory of
        /// <paramref name="filepath"/> and the artifact text is read from that file; otherwise
        /// the inner XML of the content element is used directly.
        /// When an ID occurs more than once, the first occurrence is kept and a warning is logged
        /// for each later one.
        /// </summary>
        /// <param name="filepath">Path of the XML file to import</param>
        /// <param name="trimValues">When true, collection info values, artifact ids and contents are trimmed</param>
        /// <returns>The imported artifacts collection, keyed by artifact ID</returns>
        /// <exception cref="System.Xml.XmlException">
        /// An artifact node has no <c>id</c> or no <c>content</c> child element.
        /// </exception>
        public static TLArtifactsCollection ImportArtifacts(string filepath, bool trimValues)
        {
            TLArtifactsCollection artifacts = new TLArtifactsCollection();

            XPathDocument  doc = new XPathDocument(filepath);
            XPathNavigator nav = doc.CreateNavigator();

            //read collection info
            artifacts.CollectionId          = ReadSingleNode(filepath, nav, "/artifacts_collection/collection_info/id");
            artifacts.CollectionName        = ReadSingleNode(filepath, nav, "/artifacts_collection/collection_info/name");
            artifacts.CollectionVersion     = ReadSingleNode(filepath, nav, "/artifacts_collection/collection_info/version");
            artifacts.CollectionDescription = ReadSingleNode(filepath, nav, "/artifacts_collection/collection_info/description");

            if (trimValues)
            {
                artifacts.CollectionId          = artifacts.CollectionId.Trim();
                artifacts.CollectionName        = artifacts.CollectionName.Trim();
                artifacts.CollectionVersion     = artifacts.CollectionVersion.Trim();
                artifacts.CollectionDescription = artifacts.CollectionDescription.Trim();
            }

            //check what type of content location the file has
            XPathNavigator locationNode = nav.SelectSingleNode("/artifacts_collection/collection_info/content_location");
            string         content_location_type = "internal"; //default content location is internal

            //if content location has been specified read it
            if (locationNode != null)
            {
                content_location_type = locationNode.Value;
            }

            //root dir is going to be needed to external content type, to determine absolute paths of the files
            string rootDir = System.IO.Path.GetDirectoryName(filepath);

            XPathNodeIterator artifactsIterator = nav.Select("/artifacts_collection/artifacts/artifact");

            while (artifactsIterator.MoveNext())
            {
                // SelectSingleNode returns null when the child element is absent; fail with a
                // message naming the file instead of a bare NullReferenceException.
                XPathNavigator idNode = artifactsIterator.Current.SelectSingleNode("id");
                if (idNode == null)
                {
                    throw new System.Xml.XmlException(
                        String.Format("An artifact without an 'id' element was found in file '{0}'.", filepath)
                        );
                }
                string artifactId = idNode.InnerXml;

                XPathNavigator contentNode = artifactsIterator.Current.SelectSingleNode("content");
                if (contentNode == null)
                {
                    throw new System.Xml.XmlException(
                        String.Format("Artifact '{0}' in file '{1}' has no 'content' element.", artifactId, filepath)
                        );
                }

                string content;
                if (content_location_type.Equals("external"))
                {
                    // External content: the element holds a path relative to the XML file's directory.
                    content = System.IO.File.ReadAllText(System.IO.Path.Combine(rootDir, contentNode.InnerXml.Trim()));
                }
                else
                {
                    content = contentNode.InnerXml;
                }

                if (trimValues)
                {
                    artifactId = artifactId.Trim();
                    content    = content.Trim();
                }

                // Checking if ID is already in Artifacts List
                if (!artifacts.ContainsKey(artifactId))
                {
                    TLArtifact artifact = new TLArtifact(artifactId, content);
                    artifacts.Add(artifactId, artifact);
                }
                else
                {
                    CoestDatasetImporterHelper.Logger.Warn(
                        String.Format("Repeated artifact ID '{0}' found in file '{1}'.", artifactId, filepath)
                        );
                }
            }

            return(artifacts);
        }
예제 #8
0
 /// <summary>
 /// Extracts all terms with the specified POS from a single TLArtifact.
 /// The artifact's text is first passed to <c>Tag</c> with the model file, the tagged
 /// result is then passed to <c>ExtractPOS</c>, and the output becomes the text of a
 /// new artifact that keeps the original Id.
 /// </summary>
 /// <param name="artifact">Single artifact</param>
 /// <param name="pos">Part of speech to extract</param>
 /// <param name="modelFile">Training model file location</param>
 /// <returns>New artifact consisting of only the terms with the specified POS</returns>
 public static TLArtifact ExtractArtifact(TLArtifact artifact, POSTaggerSpeechType pos, string modelFile)
 {
     // Steps are spelled out in the same order the nested one-line expression evaluates them.
     var id = artifact.Id;
     var tagged = Tag(artifact.Text, modelFile);
     var extracted = ExtractPOS(tagged, pos);

     return new TLArtifact(id, extracted);
 }