/// <summary>
/// Reads the necessary data files and initializes the member variables.
/// </summary>
/// <remarks>
/// Sets <c>CamelSplitter</c>, <c>ProgramWordCount</c>, <c>LogProgramTotalWordCount</c>,
/// <c>GlobalWordCount</c>, <c>Prefixes</c> and <c>Suffixes</c>. The global word counts and the
/// prefix/suffix lists are read from the files named by the <c>SamuraiIdSplitter.*</c> file settings.
/// </remarks>
/// <param name="programWordCount">A dictionary containing the word counts from the local program.</param>
/// <exception cref="ArgumentNullException"><paramref name="programWordCount"/> is null.</exception>
private void Initialize(Dictionary<string, int> programWordCount)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException further down.
    if (programWordCount == null)
    {
        throw new ArgumentNullException("programWordCount");
    }

    this.CamelSplitter = new ConservativeIdSplitter();

    // Store the program word counts and cache the base-10 log of their total.
    this.ProgramWordCount = programWordCount;
    ulong programTotalWordCount = 0;
    foreach (int count in programWordCount.Values)
    {
        programTotalWordCount += (ulong)count;
    }
    // Note: an empty dictionary gives a total of 0, for which Math.Log10 returns negative infinity.
    this.LogProgramTotalWordCount = Math.Log10(programTotalWordCount);

    // Load the global word counts from the default location.
    var rawGlobalWordCount = LibFileLoader.ReadWordCount(
        SwumConfiguration.GetFileSetting("SamuraiIdSplitter.GlobalWordCountFile"),
        false,
        IncludeIdentifier);

    // Weight each global count by (word length - 1)^1.5.
    this.GlobalWordCount = new Dictionary<string, double>();
    foreach (var kvp in rawGlobalWordCount)
    {
        // Clamp the base at zero: a zero-length key would otherwise give Math.Pow(-1, 1.5),
        // which is NaN (negative base with a non-integer exponent).
        int weightBase = Math.Max(0, kvp.Key.Length - 1);
        this.GlobalWordCount[kvp.Key] = kvp.Value * Math.Pow(weightBase, 1.5);
    }

    // Read the prefix and suffix lists from their default locations.
    //TODO: the words must be in lowercase. Should we lowercase them on loading, or just assume/require that they're in lowercase in the file?
    this.Prefixes = LibFileLoader.ReadWordList(SwumConfiguration.GetFileSetting("SamuraiIdSplitter.Prefixesfile"));
    this.Suffixes = LibFileLoader.ReadWordList(SwumConfiguration.GetFileSetting("SamuraiIdSplitter.Suffixesfile"));
}
/// <summary>
/// Builds a PCKimmoPartOfSpeechData instance, populating every word collection from the file
/// named by its corresponding <c>PCKimmoPartOfSpeechData.*</c> file setting.
/// </summary>
public PCKimmoPartOfSpeechData()
{
    // Each step first resolves the configured file setting, then loads the data from it.
    var twoDictPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.TwoDictFile");
    TwoDict = LibFileLoader.ReadWordList(twoDictPath);

    var prepositionsPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PrepositionsFile");
    Prepositions = LibFileLoader.ReadWordList(prepositionsPath);

    var thirdPersonSingularPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.VerbsThirdPersonSingularFile");
    VerbsThirdPersonSingular = LibFileLoader.ReadWordList(thirdPersonSingularPath);

    var thirdPersonIrregularPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.VerbsThirdPersonIrregularFile");
    VerbsThirdPersonIrregular = LibFileLoader.ReadWordList(thirdPersonIrregularPath);

    var modalVerbsPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.ModalVerbsFile");
    ModalVerbs = LibFileLoader.ReadWordList(modalVerbsPath);

    var ingVerbsPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.IngVerbsFile");
    IngVerbs = LibFileLoader.ReadWordList(ingVerbsPath);

    // Past-tense and past-participle verbs each pass a regular and an irregular file to one ReadWordList call.
    var pastTenseRegularPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PastTenseVerbsRegularFile");
    var pastTenseIrregularPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PastTenseVerbsIrregularFile");
    PastTenseVerbs = LibFileLoader.ReadWordList(pastTenseRegularPath, pastTenseIrregularPath);

    var pastParticipleRegularPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PastParticipleVerbsRegularFile");
    var pastParticipleIrregularPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PastParticipleVerbsIrregularFile");
    PastParticipleVerbs = LibFileLoader.ReadWordList(pastParticipleRegularPath, pastParticipleIrregularPath);

    var potentialVerbsPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PotentialVerbsFile");
    PotentialVerbs = LibFileLoader.ReadWordList(potentialVerbsPath);

    var onlyNounsPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.OnlyNounsFile");
    OnlyNouns = LibFileLoader.ReadWordList(onlyNounsPath);

    var adjectivesPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.AdjectivesFile");
    Adjectives = LibFileLoader.ReadWordList(adjectivesPath);

    var adverbsPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.AdverbsFile");
    Adverbs = LibFileLoader.ReadWordList(adverbsPath);

    var determinersPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.DeterminersFile");
    Determiners = LibFileLoader.ReadWordList(determinersPath);

    var pronounsPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.PronounsFile");
    Pronouns = LibFileLoader.ReadWordList(pronounsPath);

    // The ignorable sets are read with ReadWordCount; only the keys are kept.
    var ignorableVerbsPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.IgnorableVerbsFile");
    var ignorableVerbCounts = LibFileLoader.ReadWordCount(ignorableVerbsPath);
    IgnorableVerbs = new HashSet<string>(ignorableVerbCounts.Keys);

    var ignorableHeadWordsPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.IgnorableHeadWordsFile");
    var ignorableHeadWordCounts = LibFileLoader.ReadWordCount(ignorableHeadWordsPath);
    IgnorableHeadWords = new HashSet<string>(ignorableHeadWordCounts.Keys);

    var generalVerbsPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.GeneralVerbsFile");
    GeneralVerbs = LibFileLoader.ReadWordList(generalVerbsPath);

    var eventWordsPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.EventWordsFile");
    EventWords = LibFileLoader.ReadWordList(eventWordsPath);

    var sideEffectWordsPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.SideEffectWordsFile");
    SideEffectWords = LibFileLoader.ReadWordList(sideEffectWordsPath);

    // Verb particles use their own dedicated loader.
    var verbParticlesPath = SwumConfiguration.GetFileSetting("PCKimmoPartOfSpeechData.VerbParticlesFile");
    VerbParticles = LibFileLoader.ReadVerbParticleFile(verbParticlesPath);
}
///// <summary>
///// Creates a new IdentifierSplitter using the default location for program word count data.
///// </summary>
//public SamuraiIdSplitter()
//{
//    //use default location for programWordCount
//    //call Initialize()
//}

/// <summary>
/// Constructs a splitter whose local program word counts are read from the given file.
/// </summary>
/// <param name="programWordCountPath">Path of the file holding the local program word counts.</param>
public SamuraiIdSplitter(string programWordCountPath)
{
    // Load the program-specific counts first, then hand them to the shared initialization routine.
    var localWordCounts = LibFileLoader.ReadWordCount(programWordCountPath, false, IncludeIdentifier);
    this.Initialize(localWordCounts);
}