Пример #1
0
        /// <summary>
        /// Prepares the splitter's state: stores the program word counts along with the base-10 logarithm
        /// of their sum, loads the global word counts and weights them by word length, and loads the
        /// prefix and suffix word lists from the configured files.
        /// </summary>
        /// <param name="programWordCount">Word counts gathered from the local program, keyed by word.</param>
        private void Initialize(Dictionary<string, int> programWordCount)
        {
            CamelSplitter = new ConservativeIdSplitter();

            // Keep the program counts and cache log10 of their total.
            ProgramWordCount = programWordCount;
            ulong totalWords = 0;
            foreach (int count in ProgramWordCount.Values)
            {
                totalWords += (ulong)count;
            }
            LogProgramTotalWordCount = Math.Log10(totalWords);

            // Global counts come from the file named in the configuration.
            string globalCountFile = SwumConfiguration.GetFileSetting("SamuraiIdSplitter.GlobalWordCountFile");
            var unweightedCounts = LibFileLoader.ReadWordCount(globalCountFile, false, IncludeIdentifier);

            // Scale every global count by (word length - 1)^1.5.
            GlobalWordCount = new Dictionary<string, double>();
            foreach (var entry in unweightedCounts)
            {
                double lengthWeight = Math.Pow(entry.Key.Length - 1.0, 1.5);
                GlobalWordCount[entry.Key] = entry.Value * lengthWeight;
            }

            // Prefix and suffix lists also come from configured files.
            //TODO: the words must be in lowercase. Should we lowercase them on loading, or just assume/require that they're in lowercase in the file?
            string prefixFile = SwumConfiguration.GetFileSetting("SamuraiIdSplitter.Prefixesfile");
            Prefixes = LibFileLoader.ReadWordList(prefixFile);

            string suffixFile = SwumConfiguration.GetFileSetting("SamuraiIdSplitter.Suffixesfile");
            Suffixes = LibFileLoader.ReadWordList(suffixFile);
        }
Пример #2
0
        /// <summary>
        /// Sets the member data sets to their default states.
        /// </summary>
        /// <remarks>
        /// SpecialWords, BooleanArgumentVerbs and NounPhraseIndicators are each loaded from the file named by
        /// the corresponding "UnigramMethodRule.*" configuration setting. When a setting is missing, the set
        /// is left empty and a message is written to standard error. PositionalFrequencies is always created
        /// fresh.
        /// </remarks>
        private void InitializeMembers()
        {
            // The three sets are loaded in this fixed order; each goes through the same load-or-fallback helper.
            this.SpecialWords = LoadWordSetFromSetting("UnigramMethodRule.SpecialWordsFile");
            this.BooleanArgumentVerbs = LoadWordSetFromSetting("UnigramMethodRule.BooleanArgumentVerbsFile");
            this.NounPhraseIndicators = LoadWordSetFromSetting("UnigramMethodRule.NounPhraseIndicatorsFile");

            this.PositionalFrequencies = new PositionalFrequencies();
        }

        /// <summary>
        /// Builds a case-insensitive (invariant culture) word set from the word-list file named by the given
        /// configuration setting.
        /// </summary>
        /// <param name="settingName">The configuration key whose value is the path of the word-list file.</param>
        /// <returns>
        /// The words read from the configured file, or an empty set if the setting lookup returned null.
        /// </returns>
        private static HashSet<string> LoadWordSetFromSetting(string settingName)
        {
            string wordListFile = SwumConfiguration.GetFileSetting(settingName);

            if (wordListFile != null)
            {
                return new HashSet<string>(LibFileLoader.ReadWordList(wordListFile), StringComparer.InvariantCultureIgnoreCase);
            }

            // No file configured: fall back to an empty set and report it rather than failing.
            var emptySet = new HashSet<string>(StringComparer.InvariantCultureIgnoreCase);
            Console.Error.WriteLine(settingName + " not specified in config file.");
            return emptySet;
        }
Пример #3
0
 /// <summary>
 /// Creates a new PCKimmoPartOfSpeechData object, loading each part-of-speech collection from the file
 /// that SwumConfiguration reports for the matching "PCKimmoPartOfSpeechData.*" setting.
 /// </summary>
 public PCKimmoPartOfSpeechData()
 {
     TwoDict = LibFileLoader.ReadWordList(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.TwoDictFile"));
     Prepositions = LibFileLoader.ReadWordList(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PrepositionsFile"));
     VerbsThirdPersonSingular = LibFileLoader.ReadWordList(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.VerbsThirdPersonSingularFile"));
     VerbsThirdPersonIrregular = LibFileLoader.ReadWordList(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.VerbsThirdPersonIrregularFile"));
     ModalVerbs = LibFileLoader.ReadWordList(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.ModalVerbsFile"));
     IngVerbs = LibFileLoader.ReadWordList(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.IngVerbsFile"));

     // Past-tense verbs are read from a regular and an irregular file in a single call.
     string pastTenseRegularFile = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PastTenseVerbsRegularFile");
     string pastTenseIrregularFile = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PastTenseVerbsIrregularFile");
     PastTenseVerbs = LibFileLoader.ReadWordList(pastTenseRegularFile, pastTenseIrregularFile);

     // Past participles likewise come from a regular and an irregular file.
     string pastParticipleRegularFile = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PastParticipleVerbsRegularFile");
     string pastParticipleIrregularFile = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PastParticipleVerbsIrregularFile");
     PastParticipleVerbs = LibFileLoader.ReadWordList(pastParticipleRegularFile, pastParticipleIrregularFile);

     PotentialVerbs = LibFileLoader.ReadWordList(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PotentialVerbsFile"));
     OnlyNouns = LibFileLoader.ReadWordList(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.OnlyNounsFile"));
     Adjectives = LibFileLoader.ReadWordList(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.AdjectivesFile"));
     Adverbs = LibFileLoader.ReadWordList(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.AdverbsFile"));
     Determiners = LibFileLoader.ReadWordList(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.DeterminersFile"));
     Pronouns = LibFileLoader.ReadWordList(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PronounsFile"));

     // These two files are read as word counts; only the words (the keys) are kept.
     var ignorableVerbCounts = LibFileLoader.ReadWordCount(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.IgnorableVerbsFile"));
     IgnorableVerbs = new HashSet<string>(ignorableVerbCounts.Keys);

     var ignorableHeadWordCounts = LibFileLoader.ReadWordCount(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.IgnorableHeadWordsFile"));
     IgnorableHeadWords = new HashSet<string>(ignorableHeadWordCounts.Keys);

     GeneralVerbs = LibFileLoader.ReadWordList(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.GeneralVerbsFile"));
     EventWords = LibFileLoader.ReadWordList(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.EventWordsFile"));
     SideEffectWords = LibFileLoader.ReadWordList(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.SideEffectWordsFile"));

     // Verb particles use their own file format and loader.
     VerbParticles = LibFileLoader.ReadVerbParticleFile(
         SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.VerbParticlesFile"));
 }