Beispiel #1
0
        /// <summary>
        /// Stores the local program word counts, caches the base-10 log of their total, and loads the
        /// length-weighted global word counts plus the prefix and suffix lists from the configured files.
        /// </summary>
        /// <param name="programWordCount">A dictionary containing the word counts from the local program.</param>
        private void Initialize(Dictionary<string, int> programWordCount)
        {
            this.CamelSplitter = new ConservativeIdSplitter();

            // Keep the local counts and sum them so the log of the total can be cached.
            this.ProgramWordCount = programWordCount;
            ulong totalOccurrences = 0;
            foreach (int occurrences in this.ProgramWordCount.Values)
            {
                totalOccurrences += (ulong)occurrences;
            }

            this.LogProgramTotalWordCount = Math.Log10(totalOccurrences);

            // Read the unweighted global counts from the configured file.
            string globalCountPath = SwumConfiguration.GetFileSetting("SamuraiIdSplitter.GlobalWordCountFile");
            var unweightedGlobalCounts = LibFileLoader.ReadWordCount(globalCountPath, false, IncludeIdentifier);

            // Scale each global count by (word length - 1) ^ 1.5.
            this.GlobalWordCount = new Dictionary<string, double>();
            foreach (var entry in unweightedGlobalCounts)
            {
                double lengthWeight = Math.Pow(entry.Key.Length - 1, 1.5);
                this.GlobalWordCount[entry.Key] = entry.Value * lengthWeight;
            }

            // Read the prefix and suffix lists from the configured files.
            //TODO: the words must be in lowercase. Should we lowercase them on loading, or just assume/require that they're in lowercase in the file?
            string prefixListPath = SwumConfiguration.GetFileSetting("SamuraiIdSplitter.Prefixesfile");
            this.Prefixes = LibFileLoader.ReadWordList(prefixListPath);

            string suffixListPath = SwumConfiguration.GetFileSetting("SamuraiIdSplitter.Suffixesfile");
            this.Suffixes = LibFileLoader.ReadWordList(suffixListPath);
        }
Beispiel #2
0
        /// <summary>
        /// Sets the member data sets to their default states.
        /// Each word set is read from the file named by its config setting; when a setting is
        /// missing, the set is left empty and a message is written to standard error.
        /// </summary>
        private void InitializeMembers()
        {
            this.SpecialWords = LoadWordSetOrEmpty("UnigramMethodRule.SpecialWordsFile");
            this.BooleanArgumentVerbs = LoadWordSetOrEmpty("UnigramMethodRule.BooleanArgumentVerbsFile");
            this.NounPhraseIndicators = LoadWordSetOrEmpty("UnigramMethodRule.NounPhraseIndicatorsFile");

            this.PositionalFrequencies = new PositionalFrequencies();
        }

        /// <summary>
        /// Builds a case-insensitive (invariant culture) word set from the file named by the given config setting.
        /// </summary>
        /// <param name="settingName">The config setting that holds the path of the word list file.</param>
        /// <returns>
        /// The words read from the configured file, or an empty set if the setting lookup returns null.
        /// </returns>
        private static HashSet<string> LoadWordSetOrEmpty(string settingName)
        {
            string wordListFile = SwumConfiguration.GetFileSetting(settingName);

            if (wordListFile == null)
            {
                // A missing setting is reported but not fatal: callers get an empty set to work with.
                var emptySet = new HashSet<string>(StringComparer.InvariantCultureIgnoreCase);
                Console.Error.WriteLine(settingName + " not specified in config file.");
                return emptySet;
            }

            return new HashSet<string>(LibFileLoader.ReadWordList(wordListFile), StringComparer.InvariantCultureIgnoreCase);
        }
Beispiel #3
0
 /// <summary>
 /// Creates a new PCKimmoPartOfSpeechData object, loading each part-of-speech data set
 /// from the file named by its corresponding setting in SwumConfiguration.
 /// </summary>
 public PCKimmoPartOfSpeechData()
 {
     TwoDict = LibFileLoader.ReadWordList(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.TwoDictFile"));
     Prepositions = LibFileLoader.ReadWordList(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PrepositionsFile"));
     VerbsThirdPersonSingular = LibFileLoader.ReadWordList(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.VerbsThirdPersonSingularFile"));
     VerbsThirdPersonIrregular = LibFileLoader.ReadWordList(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.VerbsThirdPersonIrregularFile"));
     ModalVerbs = LibFileLoader.ReadWordList(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.ModalVerbsFile"));
     IngVerbs = LibFileLoader.ReadWordList(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.IngVerbsFile"));

     // The past-tense and past-participle lists are each read from two files (regular and irregular).
     string pastTenseRegularPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PastTenseVerbsRegularFile");
     string pastTenseIrregularPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PastTenseVerbsIrregularFile");
     PastTenseVerbs = LibFileLoader.ReadWordList(pastTenseRegularPath, pastTenseIrregularPath);

     string pastParticipleRegularPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PastParticipleVerbsRegularFile");
     string pastParticipleIrregularPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PastParticipleVerbsIrregularFile");
     PastParticipleVerbs = LibFileLoader.ReadWordList(pastParticipleRegularPath, pastParticipleIrregularPath);

     PotentialVerbs = LibFileLoader.ReadWordList(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PotentialVerbsFile"));
     OnlyNouns = LibFileLoader.ReadWordList(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.OnlyNounsFile"));
     Adjectives = LibFileLoader.ReadWordList(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.AdjectivesFile"));
     Adverbs = LibFileLoader.ReadWordList(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.AdverbsFile"));
     Determiners = LibFileLoader.ReadWordList(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.DeterminersFile"));
     Pronouns = LibFileLoader.ReadWordList(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PronounsFile"));

     // These two are read as word-count files; only the words (the keys) are kept.
     var ignorableVerbCounts = LibFileLoader.ReadWordCount(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.IgnorableVerbsFile"));
     IgnorableVerbs = new HashSet<string>(ignorableVerbCounts.Keys);

     var ignorableHeadWordCounts = LibFileLoader.ReadWordCount(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.IgnorableHeadWordsFile"));
     IgnorableHeadWords = new HashSet<string>(ignorableHeadWordCounts.Keys);

     GeneralVerbs = LibFileLoader.ReadWordList(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.GeneralVerbsFile"));
     EventWords = LibFileLoader.ReadWordList(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.EventWordsFile"));
     SideEffectWords = LibFileLoader.ReadWordList(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.SideEffectWordsFile"));

     // Verb particles use their own loader rather than the plain word-list reader.
     VerbParticles = LibFileLoader.ReadVerbParticleFile(SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.VerbParticlesFile"));
 }
Beispiel #4
0
 /// <summary>
 /// Creates a new FileStemmer whose stem data is read from the given file.
 /// </summary>
 /// <param name="stemFilePath">The path to the file containing the word/stem pairs to use.</param>
 public FileStemmer(string stemFilePath)
 {
     // Load the stem data first, then publish it through the Stems member.
     var loadedStems = LibFileLoader.ReadStemFile(stemFilePath);
     Stems = loadedStems;
 }
Beispiel #5
0
        ///// <summary>
        ///// Creates a new IdentifierSplitter using the default location for program word count data.
        ///// </summary>
        //public SamuraiIdSplitter()
        //{
        //    //use default location for programWordCount
        //    //call Initialize()
        //}

        /// <summary>
        /// Creates a new SamuraiIdSplitter, reading the local program word counts from the specified file
        /// and passing them to <see cref="Initialize"/>.
        /// </summary>
        /// <param name="programWordCountPath">The path to the file containing the local program word counts.</param>
        public SamuraiIdSplitter(string programWordCountPath)
        {
            var localWordCounts = LibFileLoader.ReadWordCount(programWordCountPath, false, IncludeIdentifier);
            Initialize(localWordCounts);
        }