/// <summary>
/// Reads every row of the named table as a suffix dictionary line and registers it,
/// through <c>AddSuffix</c>, in two newly created lookups.
/// </summary>
/// <param name="ds">Data set that contains the suffix table.</param>
/// <param name="tableName">Name of the table inside <paramref name="ds"/> to read.</param>
/// <param name="suffixesById">Receives a new dictionary that <c>AddSuffix</c> fills.</param>
/// <param name="suffixes">Receives a new surface dictionary that <c>AddSuffix</c> fills.</param>
public void Read(DataSet ds, string tableName, out Dictionary<string, Suffix> suffixesById, out MorphemeSurfaceDictionary<Suffix> suffixes)
{
    EnumerableRowCollection<DataRow> rows = ds.Tables[tableName].AsEnumerable();

    suffixesById = new Dictionary<string, Suffix>();
    suffixes = new MorphemeSurfaceDictionary<Suffix>();

    foreach (DataRow row in rows)
    {
        // "flags" and "rules" fall back to an empty string when the cell has no value;
        // the remaining columns are passed through as read.
        var line = new SuffixDictionaryLine
        {
            Id = row.Field<string>("id"),
            Lex = row.Field<string>("lexicalForm"),
            Type = row.Field<string>("type"),
            Flags = row.Field<string>("flags") ?? "",
            Rules = row.Field<string>("rules") ?? "",
            Surfaces = row.Field<string>("surfaces")
        };

        AddSuffix(line, suffixesById, suffixes);
    }
}
/// <summary>
/// Loads the orthography, morphotactics, roots and suffixes in that order and builds
/// a <c>Language</c> from them. The elapsed time of each step is written with
/// <c>Debug.Print</c>.
/// </summary>
/// <returns>
/// A new <c>Language</c> whose code is the part of <c>_dirPath</c> after its last
/// backslash, or the whole <c>_dirPath</c> when it contains no backslash.
/// </returns>
public Language Read()
{
    Stopwatch sw = Stopwatch.StartNew();

    // The orthography is stored in a field because the later read steps use it.
    _orthography = ReadOrthography();
    Debug.Print($"orthography: {sw.ElapsedMilliseconds} ms");
    sw.Restart();

    Morphotactics morphotactics = ReadMorphotactics();
    Debug.Print($"morphotactics: {sw.ElapsedMilliseconds} ms");
    sw.Restart();

    MorphemeSurfaceDictionary<Root> roots = ReadRoots();
    Debug.Print($"roots: {sw.ElapsedMilliseconds} ms");
    sw.Restart();

    Suffixes suffixes = ReadSuffixes();
    Debug.Print($"suffixes: {sw.ElapsedMilliseconds} ms");
    // Nothing is timed after this point, so the stopwatch is stopped rather than restarted.
    sw.Stop();

    int index = _dirPath.LastIndexOf("\\", StringComparison.Ordinal);
    string langCode = index > -1 ? _dirPath.Substring(index + 1) : _dirPath;

    return new Language(langCode, morphotactics, roots, suffixes);
}
/// <summary>
/// Builds a suffix container from the rows of the named table.
/// </summary>
/// <param name="ds">Data set holding the suffix table.</param>
/// <param name="tableName">Name of the table in <paramref name="ds"/> to read.</param>
/// <returns>
/// A new <c>MorphemeContainer</c> wrapping the by-id dictionary and the by-surface
/// dictionary that <c>AddSuffix</c> filled for every row.
/// </returns>
public MorphemeContainer<Suffix> Read(DataSet ds, string tableName)
{
    EnumerableRowCollection<DataRow> rows = ds.Tables[tableName].AsEnumerable();

    var byId = new Dictionary<string, Suffix>();
    var bySurface = new MorphemeSurfaceDictionary<Suffix>();

    foreach (DataRow row in rows)
    {
        // Every text cell is trimmed. Missing "flags"/"rules" cells become "";
        // other missing cells stay null.
        var line = new SuffixDictionaryLine
        {
            Id = row.Field<string>("id")?.Trim(),
            Lex = row.Field<string>("lexicalForm")?.Trim(),
            Type = row.Field<string>("type")?.Trim(),
            Labels = row.Field<string>("flags")?.Trim() ?? "",
            Rules = row.Field<string>("rules")?.Trim() ?? "",
            Surfaces = row.Field<string>("surfaces")?.Trim()
        };

        AddSuffix(line, byId, bySurface);
    }

    return new MorphemeContainer<Suffix>(byId, bySurface);
}
/// <summary>
/// Creates the suffix collection from two prebuilt lookups.
/// </summary>
/// <param name="suffixesById">Stored in <c>SuffixesById</c>.</param>
/// <param name="suffixesBySurface">Stored in <c>SuffixesBySurface</c>.</param>
public Suffixes(Dictionary<string, Suffix> suffixesById, MorphemeSurfaceDictionary<Suffix> suffixesBySurface)
{
    this.SuffixesById = suffixesById;
    this.SuffixesBySurface = suffixesBySurface;
}
/// <summary>
/// Reads the named table as suffix dictionary lines and returns the suffixes
/// indexed both by id and by surface.
/// </summary>
/// <param name="ds">Data set that contains the suffix table.</param>
/// <param name="tableName">Name of the table inside <paramref name="ds"/> to read.</param>
/// <returns>
/// A new <c>MorphemeContainer</c> built from the two lookups filled by <c>AddSuffix</c>.
/// </returns>
public MorphemeContainer<Suffix> Read(DataSet ds, string tableName)
{
    EnumerableRowCollection<DataRow> table = ds.Tables[tableName].AsEnumerable();

    // Lazy projection: each row is converted only when the loop below reaches it.
    EnumerableRowCollection<SuffixDictionaryLine> lines =
        from row in table
        select new SuffixDictionaryLine
        {
            Id = row.Field<string>("id"),
            Lex = row.Field<string>("lexicalForm"),
            Type = row.Field<string>("type"),
            Labels = row.Field<string>("flags") ?? "",
            Rules = row.Field<string>("rules") ?? "",
            Surfaces = row.Field<string>("surfaces")
        };

    var idLookup = new Dictionary<string, Suffix>();
    var surfaceLookup = new MorphemeSurfaceDictionary<Suffix>();

    foreach (SuffixDictionaryLine line in lines)
    {
        AddSuffix(line, idLookup, surfaceLookup);
    }

    return new MorphemeContainer<Suffix>(idLookup, surfaceLookup);
}
/// <summary>
/// Creates a <c>Suffix</c> from one dictionary line and registers it under its id
/// and under each of its surfaces.
/// </summary>
/// <param name="entry">
/// Parsed line. Its flags, rules and surfaces are split on commas and spaces,
/// dropping empty items.
/// </param>
/// <param name="suffixesById">Dictionary that receives the suffix under <c>entry.Id</c>.</param>
/// <param name="suffixes">Surface dictionary that receives the suffix once per surface.</param>
private void AddSuffix(SuffixDictionaryLine entry, Dictionary<string, Suffix> suffixesById, MorphemeSurfaceDictionary<Suffix> suffixes)
{
    string suffixId = entry.Id;
    string lexicalForm = entry.Lex;

    MorphemeType parsedType;
    bool typeIsKnown = Enum.TryParse(entry.Type, out parsedType);
    if (!typeIsKnown)
    {
        // Unrecognised type text: fall back to MorphemeType.O and report it on the console.
        parsedType = MorphemeType.O;
        Console.WriteLine("Invalid Morpheme Type: " + entry.Type);
    }

    char[] separators = { ',', ' ' };
    string[] labelNames = entry.Flags.Split(separators, StringSplitOptions.RemoveEmptyEntries);
    string[] ruleNames = entry.Rules.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    Debug.Assert(entry.Surfaces != null, "entry.Surfaces != null");
    string[] surfaceForms = entry.Surfaces.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    List<OrthographyRule> orthographyRules = _orthography.GetRules(ruleNames);
    var created = new Suffix(
        suffixId,
        lexicalForm,
        parsedType,
        LabelSet.ConvertLabelNamesToIndexes(labelNames),
        orthographyRules);

    suffixesById.Add(suffixId, created);

    foreach (string form in surfaceForms)
    {
        suffixes.Add(form, created);
    }
}
/// <summary>
/// Creates a <c>Root</c> from one lexicon line and registers it under its main
/// surface and under every additional surface listed on the line.
/// </summary>
/// <param name="entry">
/// Parsed line. Its surfaces, flags and rules are split on commas and spaces,
/// dropping empty items.
/// </param>
/// <param name="roots">Surface dictionary that receives the root.</param>
private void AddRoots(RootLine entry, MorphemeSurfaceDictionary<Root> roots)
{
    string item = entry.Root;
    string[] surfaces = entry.Surfaces.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries);
    string lex = entry.Lex;
    string[] flags = entry.Flags.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries);
    string type = entry.Id;
    string[] rules = entry.Rules.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries);

    // A line without a lexical form uses the root text itself.
    if (string.IsNullOrEmpty(entry.Lex))
    {
        lex = item;
    }

    Root root;
    if (type == "KISALTMA" || type == "ALINTI" || type == "KISALTMA_NOKTALI" || type == "HARF")
    {
        // For these types the root text is also passed as an extra constructor argument.
        root = new Root(type, lex, LabelSet.ConvertLabelNamesToIndexes(flags), _orthography.GetRules(rules), item);
    }
    else
    {
        root = new Root(type, lex, LabelSet.ConvertLabelNamesToIndexes(flags), _orthography.GetRules(rules));
    }

    // Register the word under its main surface.
    roots.Add(item, root);

    // If it has additional surfaces, register it under each of those as well.
    foreach (string lexicalForm in surfaces)
    {
        roots.Add(lexicalForm, root);
    }
}
/// <summary>
/// Turns one suffix dictionary line into a <c>Suffix</c>, adds it to the by-id
/// dictionary and adds it to the surface dictionary once per surface.
/// </summary>
/// <param name="entry">Line to convert; flags, rules and surfaces are comma/space separated lists.</param>
/// <param name="suffixesById">Target dictionary keyed by <c>entry.Id</c>.</param>
/// <param name="suffixes">Target dictionary keyed by surface.</param>
private void AddSuffix(SuffixDictionaryLine entry, Dictionary<string, Suffix> suffixesById, MorphemeSurfaceDictionary<Suffix> suffixes)
{
    const StringSplitOptions skipEmpty = StringSplitOptions.RemoveEmptyEntries;
    char[] listSeparators = { ',', ' ' };

    string key = entry.Id;
    string lexical = entry.Lex;

    MorphemeType kind;
    if (Enum.TryParse(entry.Type, out kind) == false)
    {
        // The type text is not a MorphemeType name: use MorphemeType.O and print a notice.
        kind = MorphemeType.O;
        Console.WriteLine("Invalid Morpheme Type: " + entry.Type);
    }

    string[] flagTokens = entry.Flags.Split(listSeparators, skipEmpty);
    string[] ruleTokens = entry.Rules.Split(listSeparators, skipEmpty);

    Debug.Assert(entry.Surfaces != null, "entry.Surfaces != null");
    var surfaceList = new List<string>(entry.Surfaces.Split(listSeparators, skipEmpty));

    List<OrthographyRule> resolvedRules = _orthography.GetRules(ruleTokens);
    var result = new Suffix(key, lexical, kind, LabelSet.ConvertLabelNamesToIndexes(flagTokens), resolvedRules);

    suffixesById.Add(key, result);
    surfaceList.ForEach(surface => suffixes.Add(surface, result));
}
/// <summary>
/// Converts the rows of the named table into suffix dictionary lines and hands each
/// one to <c>AddSuffix</c> together with two newly created lookups.
/// </summary>
/// <param name="ds">Data set holding the suffix table.</param>
/// <param name="tableName">Name of the table in <paramref name="ds"/> to read.</param>
/// <param name="suffixesById">Set to a new dictionary, then filled by <c>AddSuffix</c>.</param>
/// <param name="suffixes">Set to a new surface dictionary, then filled by <c>AddSuffix</c>.</param>
public void Read(DataSet ds, string tableName, out Dictionary<string, Suffix> suffixesById, out MorphemeSurfaceDictionary<Suffix> suffixes)
{
    EnumerableRowCollection<DataRow> table = ds.Tables[tableName].AsEnumerable();

    // Lazy projection: rows are converted one at a time while the loop below runs.
    EnumerableRowCollection<SuffixDictionaryLine> lines =
        from row in table
        select new SuffixDictionaryLine
        {
            Id = row.Field<string>("id"),
            Lex = row.Field<string>("lexicalForm"),
            Type = row.Field<string>("type"),
            Flags = row.Field<string>("flags") ?? "",
            Rules = row.Field<string>("rules") ?? "",
            Surfaces = row.Field<string>("surfaces")
        };

    suffixesById = new Dictionary<string, Suffix>();
    suffixes = new MorphemeSurfaceDictionary<Suffix>();

    foreach (SuffixDictionaryLine line in lines)
    {
        AddSuffix(line, suffixesById, suffixes);
    }
}
/// <summary>
/// Reads the named table as root lexicon lines and adds every line whose "active"
/// cell is missing or blank after trimming to the given lookups via <c>AddRoots</c>.
/// </summary>
/// <param name="ds">Data set that contains the root table.</param>
/// <param name="tableName">Name of the table inside <paramref name="ds"/> to read.</param>
/// <param name="rootsById">Dictionary passed on to <c>AddRoots</c>.</param>
/// <param name="rootsBySurface">Surface dictionary passed on to <c>AddRoots</c>.</param>
public void AddEntries(DataSet ds, string tableName, Dictionary<string, Root> rootsById, MorphemeSurfaceDictionary<Root> rootsBySurface)
{
    var data = ds.Tables[tableName].AsEnumerable();
    var entries = data.Select(x => new RootLine
    {
        // The root text is mandatory; a missing cell fails here.
        Root = x.Field<string>("root").Trim(),
        Surfaces = x.Field<string>("surfaces")?.Trim() ?? "",
        // A missing lexical form is treated like the other optional columns and
        // becomes "" instead of throwing NullReferenceException on Trim().
        Lex = x.Field<string>("lex")?.Trim() ?? "",
        Active = x.Field<string>("active")?.Trim() ?? "",
        Pos = x.Field<string>("Id")?.Trim(),
        Labels = x.Field<string>("flags")?.Trim() ?? "",
        Rules = x.Field<string>("rules")?.Trim() ?? ""
    });

    foreach (var entry in entries)
    {
        // Rows with any text in "active" are skipped.
        if (entry.Active == "")
        {
            AddRoots(entry, rootsById, rootsBySurface);
        }
    }
}
/// <summary>
/// Creates a language from its code and its already loaded components.
/// </summary>
/// <param name="code">Stored in <c>_code</c>.</param>
/// <param name="morphotactics">Stored in <c>Morphotactics</c>.</param>
/// <param name="roots">Stored in <c>_roots</c>.</param>
/// <param name="suffixes">Stored in <c>_suffixes</c>.</param>
internal Language(string code, Morphotactics morphotactics, MorphemeSurfaceDictionary<Root> roots, Suffixes suffixes)
{
    this._code = code;
    this.Morphotactics = morphotactics;
    this._roots = roots;
    this._suffixes = suffixes;
}
/// <summary>
/// Initializes the language with the given code, morphotactics, root dictionary
/// and suffixes. The arguments are stored as passed.
/// </summary>
/// <param name="code">Language code kept in <c>_code</c>.</param>
/// <param name="morphotactics">Value assigned to <c>Morphotactics</c>.</param>
/// <param name="roots">Root surface dictionary kept in <c>_roots</c>.</param>
/// <param name="suffixes">Suffix collection kept in <c>_suffixes</c>.</param>
internal Language(
    string code,
    Morphotactics morphotactics,
    MorphemeSurfaceDictionary<Root> roots,
    Suffixes suffixes)
{
    _code = code;
    Morphotactics = morphotactics;

    _roots = roots;
    _suffixes = suffixes;
}
/// <summary>
/// Creates a <c>Root</c> from one lexicon line, registers it by id ("lex/pos") and
/// registers it under its main surface and every additional surface.
/// </summary>
/// <param name="entry">
/// Parsed line. Its surfaces, labels and rules are split on commas and spaces,
/// dropping empty items.
/// </param>
/// <param name="rootsById">
/// Receives the root under its id. When the id is already present the existing
/// entry is kept and a warning is traced.
/// </param>
/// <param name="rootsBySurface">Receives the root once per surface, duplicates included.</param>
private void AddRoots(RootLine entry, Dictionary<string, Root> rootsById, MorphemeSurfaceDictionary<Root> rootsBySurface)
{
    char[] separators = { ',', ' ' };

    var mainSurface = entry.Root;
    var surfaces = new List<string> { mainSurface };
    surfaces.AddRange(entry.Surfaces.Split(separators, StringSplitOptions.RemoveEmptyEntries));

    var labels = entry.Labels.Split(separators, StringSplitOptions.RemoveEmptyEntries);
    var pos = entry.Pos;
    var rules = entry.Rules.Split(separators, StringSplitOptions.RemoveEmptyEntries).ToList();

    var lex = entry.Lex;
    if (string.IsNullOrEmpty(lex))
    {
        // Test for a missing lexical form first: Regex.IsMatch throws
        // ArgumentNullException on null input, which made this fallback unreachable.
        lex = mainSurface;
    }
    else if (Regex.IsMatch(lex, @"\p{L}[2-9]"))
    {
        // The lexical form has a letter directly followed by a digit from 2 to 9.
        rules.Add("DROP_ID_DIGIT");
    }

    var root = new Root(
        pos,
        lex,
        new ImmutableSortedSet<string>(surfaces),
        new ImmutableHashSet<string>(labels),
        _orthography.GetRules(rules));

    var id = lex + "/" + pos;
    if (!rootsById.ContainsKey(id))
    {
        rootsById.Add(id, root);
    }
    else
    {
        Trace.TraceEvent(TraceEventType.Warning, 0, $"Duplicate root: {id}");
    }

    foreach (var lexicalForm in surfaces)
    {
        rootsBySurface.Add(lexicalForm, root);
    }
}
/// <summary>
/// Builds a <c>Root</c> from one lexicon line and adds it to the by-id dictionary
/// (key "lex/pos") and to the surface dictionary for each of its surfaces.
/// </summary>
/// <param name="entry">
/// Parsed line. Its surfaces, labels and rules are comma/space separated lists;
/// empty items are ignored.
/// </param>
/// <param name="rootsById">
/// Dictionary keyed by id. A duplicate id is not added and is reported with a warning.
/// </param>
/// <param name="rootsBySurface">Surface dictionary; the root is added for every surface.</param>
private void AddRoots(RootLine entry, Dictionary<string, Root> rootsById, MorphemeSurfaceDictionary<Root> rootsBySurface)
{
    var mainSurface = entry.Root;

    // The main surface always comes first, followed by any extra surfaces on the line.
    var surfaces = new List<string> { mainSurface };
    surfaces.AddRange(entry.Surfaces.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries));

    var lex = entry.Lex;
    var labels = entry.Labels.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries);
    var pos = entry.Pos;
    var rules = entry.Rules.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries).ToList();

    // Only run the pattern on a real lexical form: Regex.IsMatch rejects null with
    // ArgumentNullException, which would prevent the fallback below from running.
    if (!string.IsNullOrEmpty(lex) && Regex.IsMatch(lex, @"\p{L}[2-9]"))
    {
        // A letter directly followed by a digit from 2 to 9 adds this extra rule.
        rules.Add("DROP_ID_DIGIT");
    }

    // A line without a lexical form uses its main surface instead.
    if (string.IsNullOrEmpty(lex))
    {
        lex = mainSurface;
    }

    var root = new Root(pos, lex, new ImmutableSortedSet<string>(surfaces), new ImmutableHashSet<string>(labels), _orthography.GetRules(rules));

    var id = lex + "/" + pos;
    if (!rootsById.ContainsKey(id))
    {
        rootsById.Add(id, root);
    }
    else
    {
        Trace.TraceEvent(TraceEventType.Warning, 0, $"Duplicate root: {id}");
    }

    foreach (var lexicalForm in surfaces)
    {
        rootsBySurface.Add(lexicalForm, root);
    }
}
/// <summary>
/// Parses root lexicon text into a container of roots indexed by id and by surface.
/// </summary>
/// <param name="dataTxt">Lexicon text; it is turned into a stream and converted to a data set.</param>
/// <returns>A new <c>MorphemeContainer</c> holding the two lookups filled by the reader.</returns>
/// <exception cref="InvalidLanguageFileException">
/// Wraps any exception raised while parsing; the message ends with <c>_languageType</c>.
/// </exception>
private MorphemeContainer<Root> ParseRoots(string dataTxt)
{
    try
    {
        var bySurface = new MorphemeSurfaceDictionary<Root>();
        var byId = new Dictionary<string, Root>();
        var lexiconReader = new RootLexiconReader(_orthography);

        using (var input = GenerateStreamFromString(dataTxt))
        {
            lexiconReader.AddEntries(
                TextToDataSet.Convert(input, DefaultTableName, Delimiter),
                DefaultTableName,
                byId,
                bySurface);
        }

        return new MorphemeContainer<Root>(byId, bySurface);
    }
    catch (Exception cause)
    {
        throw new InvalidLanguageFileException(cause, Type.Roots, "Invalid language file for roots: " + _languageType);
    }
}
/// <summary>
/// Loads the root lexicon into a new surface dictionary. In external mode only the
/// main roots file is read from disk; otherwise the main roots, person names and
/// abbreviations are read, in that order, through <c>EmbeddedTextResourceToDataSet</c>.
/// </summary>
/// <returns>The filled root surface dictionary.</returns>
/// <exception cref="InvalidLanguageFileException">Wraps any exception raised while loading.</exception>
private MorphemeSurfaceDictionary<Root> ReadRoots()
{
    try
    {
        var roots = new MorphemeSurfaceDictionary<Root>();
        var reader = new RootLexiconReader(_orthography);
        string rootsPath = _dirPath + _seperator + Resources.InternalMainRootsPath;

        if (!_external)
        {
            reader.AddEntries(EmbeddedTextResourceToDataSet(rootsPath), DefaultTableName, roots);

            string namesPath = _dirPath + _seperator + Resources.InternalPersonNamesPath;
            reader.AddEntries(EmbeddedTextResourceToDataSet(namesPath), DefaultTableName, roots);

            string abbreviationPath = _dirPath + _seperator + Resources.InternalAbbreviationsPath;
            reader.AddEntries(EmbeddedTextResourceToDataSet(abbreviationPath), DefaultTableName, roots);

            return roots;
        }

        // External mode: the file is opened read-only while allowing other readers and writers.
        using (var file = new FileStream(rootsPath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
        {
            reader.AddEntries(TextToDataSet.Convert(file, DefaultTableName, Delimiter), DefaultTableName, roots);
        }

        return roots;
    }
    catch (Exception ex)
    {
        throw new InvalidLanguageFileException(ex, Type.Roots, "Invalid language file for roots: ");
    }
}
/// <summary>
/// Builds a <c>Root</c> from one lexicon line and adds it to the dictionary under
/// its main surface and under each additional surface on the line.
/// </summary>
/// <param name="entry">
/// Parsed line. Its surfaces, flags and rules are comma/space separated lists;
/// empty items are ignored.
/// </param>
/// <param name="roots">Surface dictionary that receives the root.</param>
private void AddRoots(RootLine entry, MorphemeSurfaceDictionary<Root> roots)
{
    string item = entry.Root;
    string[] surfaces = entry.Surfaces.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries);
    string lex = entry.Lex;
    string[] flags = entry.Flags.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries);
    string type = entry.Id;
    string[] rules = entry.Rules.Split(new[] { ',', ' ' }, StringSplitOptions.RemoveEmptyEntries);

    // When the line gives no lexical form, the root text is used in its place.
    if (string.IsNullOrEmpty(entry.Lex))
    {
        lex = item;
    }

    Root root;
    if (type == "KISALTMA" || type == "ALINTI" || type == "KISALTMA_NOKTALI" || type == "HARF")
    {
        // These four types use the constructor overload that also takes the root text.
        root = new Root(type, lex, LabelSet.ConvertLabelNamesToIndexes(flags), _orthography.GetRules(rules), item);
    }
    else
    {
        root = new Root(type, lex, LabelSet.ConvertLabelNamesToIndexes(flags), _orthography.GetRules(rules));
    }

    // We add the word with its main surface.
    roots.Add(item, root);

    // If it has extra surfaces, we add it with those too.
    foreach (string lexicalForm in surfaces)
    {
        roots.Add(lexicalForm, root);
    }
}
/// <summary>
/// Reads the named table as root lexicon lines and adds every line whose "active"
/// cell is missing or empty to <paramref name="roots"/> via <c>AddRoots</c>.
/// </summary>
/// <param name="ds">Data set that contains the root table.</param>
/// <param name="tableName">Name of the table inside <paramref name="ds"/> to read.</param>
/// <param name="roots">Surface dictionary passed on to <c>AddRoots</c>.</param>
public void AddEntries(DataSet ds, string tableName, MorphemeSurfaceDictionary<Root> roots)
{
    EnumerableRowCollection<DataRow> rows = ds.Tables[tableName].AsEnumerable();

    foreach (DataRow row in rows)
    {
        // The whole line is built before the "active" check, so every column is read for every row.
        var line = new RootLine
        {
            Root = row.Field<string>("root"),
            Surfaces = row.Field<string>("surfaces") ?? "",
            Lex = row.Field<string>("lex"),
            Active = row.Field<string>("active") ?? "",
            Id = row.Field<string>("Id"),
            Flags = row.Field<string>("flags") ?? "",
            Rules = row.Field<string>("rules") ?? ""
        };

        if (line.Active == "")
        {
            AddRoots(line, roots);
        }
    }
}
/// <summary>
/// Converts the rows of the named table into root lexicon lines, keeps those whose
/// "active" value is an empty string, and passes each kept line to <c>AddRoots</c>.
/// </summary>
/// <param name="ds">Data set holding the root table.</param>
/// <param name="tableName">Name of the table in <paramref name="ds"/> to read.</param>
/// <param name="roots">Surface dictionary that <c>AddRoots</c> fills.</param>
public void AddEntries(DataSet ds, string tableName, MorphemeSurfaceDictionary<Root> roots)
{
    EnumerableRowCollection<DataRow> table = ds.Tables[tableName].AsEnumerable();

    // Missing "surfaces", "active", "flags" and "rules" cells are replaced by "".
    EnumerableRowCollection<RootLine> activeLines = table
        .Select(row => new RootLine
        {
            Root = row.Field<string>("root"),
            Surfaces = row.Field<string>("surfaces") ?? "",
            Lex = row.Field<string>("lex"),
            Active = row.Field<string>("active") ?? "",
            Id = row.Field<string>("Id"),
            Flags = row.Field<string>("flags") ?? "",
            Rules = row.Field<string>("rules") ?? ""
        })
        .Where(line => line.Active == "");

    foreach (RootLine line in activeLines)
    {
        AddRoots(line, roots);
    }
}
/// <summary>
/// Creates a <c>Suffix</c> from one dictionary line, registers it by id unless that
/// id is already present, and registers it under each of its surfaces.
/// </summary>
/// <param name="entry">
/// Parsed line. Its labels, rules and surfaces are split on commas and spaces,
/// dropping empty items.
/// </param>
/// <param name="suffixesById">
/// Dictionary keyed by <c>entry.Id</c>. On a duplicate id the existing entry is kept
/// and a warning is traced.
/// </param>
/// <param name="suffixes">
/// Surface dictionary. Keys are the surfaces with '_' replaced by ' '; the suffix is
/// added for every surface even when its id was a duplicate.
/// </param>
private void AddSuffix(SuffixDictionaryLine entry, Dictionary<string, Suffix> suffixesById, MorphemeSurfaceDictionary<Suffix> suffixes)
{
    string suffixId = entry.Id;
    string lexicalForm = entry.Lex;

    MorphemeType parsedType;
    bool typeIsKnown = Enum.TryParse(entry.Type, out parsedType);
    if (!typeIsKnown)
    {
        // Unrecognised type text: fall back to MorphemeType.O and trace an error.
        parsedType = MorphemeType.O;
        Trace.TraceEvent(TraceEventType.Error, 0, $"Invalid Morpheme Type: {entry.Type}");
    }

    char[] separators = { ',', ' ' };
    string[] labelNames = entry.Labels.Split(separators, StringSplitOptions.RemoveEmptyEntries);
    string[] ruleNames = entry.Rules.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    Debug.Assert(entry.Surfaces != null, "entry.Surfaces != null");
    var surfaceForms = new List<string>(entry.Surfaces.Split(separators, StringSplitOptions.RemoveEmptyEntries));

    List<OrthographyRule> orthographyRules = _orthography.GetRules(ruleNames);

    // The suffix itself receives the surfaces exactly as written (underscores kept).
    var created = new Suffix(
        suffixId,
        lexicalForm,
        new ImmutableSortedSet<string>(surfaceForms),
        parsedType,
        new ImmutableHashSet<string>(labelNames),
        orthographyRules);

    if (!suffixesById.ContainsKey(suffixId))
    {
        suffixesById.Add(suffixId, created);
    }
    else
    {
        Trace.TraceEvent(TraceEventType.Warning, 0, $"Duplicate suffix: {suffixId}");
    }

    foreach (string form in surfaceForms)
    {
        string lookupKey = form.Replace('_', ' ');
        suffixes.Add(lookupKey, created);
    }
}
/// <summary>
/// Turns one suffix dictionary line into a <c>Suffix</c> and adds it to both lookups.
/// </summary>
/// <param name="entry">Line to convert; labels, rules and surfaces are comma/space separated lists.</param>
/// <param name="suffixesById">
/// By-id dictionary. An id that is already present is reported with a warning and not overwritten.
/// </param>
/// <param name="suffixes">
/// By-surface dictionary. Each surface is added with its underscores turned into spaces.
/// </param>
private void AddSuffix(SuffixDictionaryLine entry, Dictionary<string, Suffix> suffixesById, MorphemeSurfaceDictionary<Suffix> suffixes)
{
    const StringSplitOptions skipEmpty = StringSplitOptions.RemoveEmptyEntries;
    char[] listSeparators = { ',', ' ' };

    string key = entry.Id;
    string lexical = entry.Lex;

    MorphemeType kind;
    if (Enum.TryParse(entry.Type, out kind) == false)
    {
        // The type text is not a MorphemeType name: use MorphemeType.O and trace the problem.
        kind = MorphemeType.O;
        Trace.TraceEvent(TraceEventType.Error, 0, $"Invalid Morpheme Type: {entry.Type}");
    }

    string[] labelTokens = entry.Labels.Split(listSeparators, skipEmpty);
    string[] ruleTokens = entry.Rules.Split(listSeparators, skipEmpty);

    Debug.Assert(entry.Surfaces != null, "entry.Surfaces != null");
    var surfaceList = new List<string>(entry.Surfaces.Split(listSeparators, skipEmpty));

    List<OrthographyRule> resolvedRules = _orthography.GetRules(ruleTokens);
    var result = new Suffix(key, lexical, new ImmutableSortedSet<string>(surfaceList), kind, new ImmutableHashSet<string>(labelTokens), resolvedRules);

    bool alreadyKnown = suffixesById.ContainsKey(key);
    if (alreadyKnown)
    {
        Trace.TraceEvent(TraceEventType.Warning, 0, $"Duplicate suffix: {key}");
    }
    else
    {
        suffixesById.Add(key, result);
    }

    // Surfaces are registered regardless of whether the id was a duplicate.
    surfaceList.ForEach(surface => suffixes.Add(surface.Replace('_', ' '), result));
}
/// <summary>
/// Reads the named table as root lexicon lines and adds every line whose "active"
/// cell is missing or empty to the given lookups via <c>AddRoots</c>.
/// </summary>
/// <param name="ds">Data set that contains the root table.</param>
/// <param name="tableName">Name of the table inside <paramref name="ds"/> to read.</param>
/// <param name="rootsById">Dictionary passed on to <c>AddRoots</c>.</param>
/// <param name="rootsBySurface">Surface dictionary passed on to <c>AddRoots</c>.</param>
public void AddEntries(DataSet ds, string tableName, Dictionary<string, Root> rootsById, MorphemeSurfaceDictionary<Root> rootsBySurface)
{
    var rows = ds.Tables[tableName].AsEnumerable();

    foreach (var row in rows)
    {
        // The whole line is built before the "active" check, so every column is read for every row.
        var line = new RootLine
        {
            Root = row.Field<string>("root"),
            Surfaces = row.Field<string>("surfaces") ?? "",
            Lex = row.Field<string>("lex"),
            Active = row.Field<string>("active") ?? "",
            Pos = row.Field<string>("Id"),
            Labels = row.Field<string>("flags") ?? "",
            Rules = row.Field<string>("rules") ?? ""
        };

        if (line.Active == "")
        {
            AddRoots(line, rootsById, rootsBySurface);
        }
    }
}
/// <summary>
/// Fills a new root surface dictionary. When <c>_external</c> is set, the main roots
/// file is read from disk and nothing else; otherwise the main roots, person names
/// and abbreviations are loaded in turn through <c>EmbeddedTextResourceToDataSet</c>.
/// </summary>
/// <returns>The root surface dictionary filled by the lexicon reader.</returns>
/// <exception cref="InvalidLanguageFileException">Wraps any exception raised while loading.</exception>
private MorphemeSurfaceDictionary<Root> ReadRoots()
{
    try
    {
        var result = new MorphemeSurfaceDictionary<Root>();
        var lexiconReader = new RootLexiconReader(_orthography);
        string mainPath = _dirPath + _seperator + Resources.InternalMainRootsPath;

        if (_external)
        {
            // Opened read-only while allowing other readers and writers on the file.
            using (var input = new FileStream(mainPath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
            {
                lexiconReader.AddEntries(TextToDataSet.Convert(input, DefaultTableName, Delimiter), DefaultTableName, result);
            }
        }
        else
        {
            lexiconReader.AddEntries(EmbeddedTextResourceToDataSet(mainPath), DefaultTableName, result);

            string personNamesPath = _dirPath + _seperator + Resources.InternalPersonNamesPath;
            lexiconReader.AddEntries(EmbeddedTextResourceToDataSet(personNamesPath), DefaultTableName, result);

            string abbreviationsPath = _dirPath + _seperator + Resources.InternalAbbreviationsPath;
            lexiconReader.AddEntries(EmbeddedTextResourceToDataSet(abbreviationsPath), DefaultTableName, result);
        }

        return result;
    }
    catch (Exception cause)
    {
        throw new InvalidLanguageFileException(cause, Type.Roots, "Invalid language file for roots: ");
    }
}
/// <summary>
/// Converts root lexicon text into a data set and reads it into a container of
/// roots indexed by id and by surface.
/// </summary>
/// <param name="dataTxt">Lexicon text to parse.</param>
/// <returns>A new <c>MorphemeContainer</c> built from the two lookups the reader filled.</returns>
/// <exception cref="InvalidLanguageFileException">Wraps any exception raised while parsing.</exception>
private MorphemeContainer<Root> ParseRoots(string dataTxt)
{
    try
    {
        var surfaceLookup = new MorphemeSurfaceDictionary<Root>();
        var idLookup = new Dictionary<string, Root>();
        var rootReader = new RootLexiconReader(_orthography);

        // The stream is needed only while the text is converted and read.
        using (var text = GenerateStreamFromString(dataTxt))
        {
            var table = TextToDataSet.Convert(text, DefaultTableName, Delimiter);
            rootReader.AddEntries(table, DefaultTableName, idLookup, surfaceLookup);
        }

        var container = new MorphemeContainer<Root>(idLookup, surfaceLookup);
        return container;
    }
    catch (Exception cause)
    {
        throw new InvalidLanguageFileException(cause, Type.Roots, "Invalid language file for roots: ");
    }
}