/// <summary>
/// Reads an RCO result XML file together with the text it was produced from,
/// turns every recognised entity into a <c>NerTextEntity</c> and writes both the
/// plain text and the rendered corpus to files in the working directory.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    var corpus = new EntitiesCorpus();

    // Both input files are read as Windows-1251.
    Encoding cp1251 = Encoding.GetEncoding("windows-1251");
    string resultXml = File.ReadAllText(@"E:\SystemsTests\rco\bin\result.xml", cp1251);
    string sourceText = File.ReadAllText(@"E:\SystemsTests\rco\bin\samples\modelsall_text_cp1251.txt", cp1251);
    corpus.ClearedText = sourceText;

    XDocument parsed = XDocument.Parse(resultXml);
    XElement outputNode = parsed.Element("output");
    XElement documentNode = outputNode.Element("document");

    // Flatten output/document/sentence/entities/entity into a single list.
    var entityNodes =
        (from sentenceNode in documentNode.Elements("sentence")
         from entitiesNode in sentenceNode.Elements("entities")
         from entityNode in entitiesNode.Elements("entity")
         select entityNode).ToList();

    foreach (var entityNode in entityNodes)
    {
        var nerType = GetType(entityNode.GetAttributeValue("type"));
        if (nerType == NerType.Undefined)
        {
            // Types that map to Undefined are not added to the corpus.
            continue;
        }

        // NOTE(review): ParseInt(-1) presumably yields -1 when the attribute
        // cannot be parsed - confirm against the helper; such values are
        // passed on to NerTextEntity unchecked.
        var begin = entityNode.GetAttributeValue("offset").ParseInt(-1);
        var length = entityNode.GetAttributeValue("length").ParseInt(-1);
        var end = length + begin;

        corpus.Entities.Add(new NerTextEntity(begin, end, nerType));
    }

    File.WriteAllText("modelsall_rco_text.txt", corpus.ClearedText);
    File.WriteAllText("modelsall_rco.txt", corpus.Render());

    // Block until a line is entered on the console.
    Console.ReadLine();
}
/// <summary>
/// Registers a new entity for the span (<paramref name="begin"/>, <paramref name="endi"/>)
/// unless that span is already present in <paramref name="allEntities"/>.
/// </summary>
/// <remarks>
/// When the span is already registered nothing is added. If the registered
/// entity has a different type than <paramref name="type"/>, it is additionally
/// removed from <c>model.Entities</c>; its dictionary entry is kept, so later
/// calls for the same span are still ignored.
/// NOTE(review): confirm that dropping a span with conflicting types (rather
/// than replacing it) is the intended policy.
/// </remarks>
/// <param name="model">Corpus whose entity collection is updated.</param>
/// <param name="begin">First component of the span key.</param>
/// <param name="endi">Second component of the span key.</param>
/// <param name="type">Entity type to register.</param>
/// <param name="allEntities">Index of already registered entities keyed by span.</param>
private static void AddEntityToModel(EntitiesCorpus model, int begin, int endi, NerType type, Dictionary<KeyValuePair<int, int>, NerTextEntity> allEntities)
{
    var span = new KeyValuePair<int, int>(begin, endi);

    NerTextEntity registered;
    if (allEntities.TryGetValue(span, out registered))
    {
        if (registered.Type != type)
        {
            model.Entities.Remove(registered);
        }

        // The span is taken either way: never add a second entity for it.
        return;
    }

    var created = new NerTextEntity(begin, endi, type);
    model.Entities.Add(created);
    allEntities.Add(span, created);
}
/// <summary>
/// Loads a JSON annotation file, builds an <c>EntitiesCorpus</c> from its
/// <c>doc.text</c>, <c>doc.entities.entity</c> and <c>doc.mentions.mention</c>
/// members, and writes the rendered corpus to <paramref name="outputPath"/>.
/// </summary>
/// <param name="inputPath">Path of the JSON file to read.</param>
/// <param name="outputPath">Path the rendered corpus is written to.</param>
/// <param name="types">
/// Receives the raw type string of every mention whose entity id was registered,
/// including types that later map to <c>NerType.Undefined</c>.
/// </param>
private static void ConvertJsonToCorpus(string inputPath, string outputPath, ConcurrentBag<string> types)
{
    string rawJson = File.ReadAllText(inputPath);
    var corpus = new EntitiesCorpus();

    dynamic root = JsonConvert.DeserializeObject(rawJson);
    var doc = root.doc;
    corpus.ClearedText = doc.text;

    var entityList = doc.entities.entity;
    var typeByEntityId = new Dictionary<string, string>();

    if (entityList != null)
    {
        // Pass 1: remember the type of every entity whose level is "NAM".
        foreach (dynamic entityItem in entityList)
        {
            string entityId = entityItem.eid;
            if (entityItem.level == "NAM")
            {
                string entityType = entityItem.type;
                typeByEntityId.Add(entityId, entityType);
            }
        }

        // Pass 2: turn mentions of the remembered entities into corpus entities.
        var mentionList = doc.mentions.mention;
        if (mentionList != null)
        {
            foreach (dynamic mentionItem in mentionList)
            {
                int begin = mentionItem.begin;
                int end = mentionItem.end;
                string mentionEntityId = mentionItem.eid;

                string mentionType;
                if (!typeByEntityId.TryGetValue(mentionEntityId, out mentionType))
                {
                    continue;
                }

                types.Add(mentionType);

                var nerType = GetType(mentionType);
                if (nerType == NerType.Undefined)
                {
                    continue;
                }

                // NOTE(review): end + 1 presumably converts an inclusive end
                // offset into an exclusive one - confirm against the input format.
                corpus.Entities.Add(new NerTextEntity(begin, end + 1, nerType));
            }
        }
    }

    // The output file is written even when no entities were found.
    File.WriteAllText(outputPath, corpus.Render());
}
/// <summary>
/// Adds an entity covering (<paramref name="begin"/>, <paramref name="endi"/>) to
/// <paramref name="model"/> and to the span index, provided the span has not
/// been seen before.
/// </summary>
/// <remarks>
/// For a span that is already indexed no entity is added. If the indexed entity
/// carries a type other than <paramref name="type"/>, it is removed from
/// <c>model.Entities</c> while its index entry remains, so the span stays blocked.
/// NOTE(review): verify that discarding spans with conflicting types is intended.
/// </remarks>
/// <param name="model">Corpus whose entity collection is modified.</param>
/// <param name="begin">First component of the span key.</param>
/// <param name="endi">Second component of the span key.</param>
/// <param name="type">Type of the entity to add.</param>
/// <param name="allEntities">Span-keyed index of entities added so far.</param>
private static void AddEntityToModel(EntitiesCorpus model, int begin, int endi, NerType type, Dictionary<KeyValuePair<int, int>, NerTextEntity> allEntities)
{
    var key = new KeyValuePair<int, int>(begin, endi);

    NerTextEntity previous;
    bool alreadyIndexed = allEntities.TryGetValue(key, out previous);

    if (!alreadyIndexed)
    {
        var fresh = new NerTextEntity(begin, endi, type);
        model.Entities.Add(fresh);
        allEntities.Add(key, fresh);
        return;
    }

    // Known span: only a type conflict has an effect, and it removes the old entity.
    if (previous.Type != type)
    {
        model.Entities.Remove(previous);
    }
}
/// <summary>
/// Converts one JSON annotation file into a rendered corpus file. The text comes
/// from <c>doc.text</c>; entities are created from <c>doc.mentions.mention</c>
/// items whose <c>eid</c> refers to an entry of <c>doc.entities.entity</c> with
/// level "NAM".
/// </summary>
/// <param name="inputPath">JSON file to read.</param>
/// <param name="outputPath">File the rendered corpus is written to.</param>
/// <param name="types">
/// Collects the type string of each mention that resolved to a registered entity,
/// whether or not that type maps to a defined <c>NerType</c>.
/// </param>
private static void ConvertJsonToCorpus(string inputPath, string outputPath, ConcurrentBag<string> types)
{
    string jsonText = File.ReadAllText(inputPath);
    var corpus = new EntitiesCorpus();

    dynamic parsed = JsonConvert.DeserializeObject(jsonText);
    var document = parsed.doc;
    corpus.ClearedText = document.text;

    var entityItems = document.entities.entity;
    var namedTypes = new Dictionary<string, string>();

    if (entityItems != null)
    {
        // Index entity id -> type, restricted to entities with level "NAM".
        foreach (dynamic item in entityItems)
        {
            string itemId = item.eid;
            if (item.level == "NAM")
            {
                string itemType = item.type;
                namedTypes.Add(itemId, itemType);
            }
        }

        var mentionItems = document.mentions.mention;
        if (mentionItems != null)
        {
            foreach (dynamic item in mentionItems)
            {
                int begin = item.begin;
                int end = item.end;
                string ownerId = item.eid;

                string ownerType;
                bool resolved = namedTypes.TryGetValue(ownerId, out ownerType);
                if (!resolved)
                {
                    continue;
                }

                // Recorded before the NerType mapping, so unmapped types are kept too.
                types.Add(ownerType);

                var mapped = GetType(ownerType);
                if (mapped != NerType.Undefined)
                {
                    // NOTE(review): the +1 looks like an inclusive-to-exclusive
                    // end conversion - confirm against the input format.
                    corpus.Entities.Add(new NerTextEntity(begin, end + 1, mapped));
                }
            }
        }
    }

    // Always written, even if the corpus ended up without entities.
    File.WriteAllText(outputPath, corpus.Render());
}
/// <summary>
/// Builds a corpus from an RCO result XML file and the text it annotates, then
/// writes the plain text and the rendered corpus into the working directory.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    var corpus = new EntitiesCorpus();

    // The XML and the text file are both decoded as Windows-1251.
    Encoding encoding = Encoding.GetEncoding("windows-1251");
    string xmlContent = File.ReadAllText(@"E:\SystemsTests\rco\bin\result.xml", encoding);
    string textContent = File.ReadAllText(@"E:\SystemsTests\rco\bin\samples\modelsall_text_cp1251.txt", encoding);
    corpus.ClearedText = textContent;

    XElement documentElement = XDocument.Parse(xmlContent)
        .Element("output")
        .Element("document");

    // Collect every output/document/sentence/entities/entity element up front.
    var entityElements = new List<XElement>();
    foreach (XElement sentenceElement in documentElement.Elements("sentence"))
    {
        foreach (XElement entitiesElement in sentenceElement.Elements("entities"))
        {
            entityElements.AddRange(entitiesElement.Elements("entity"));
        }
    }

    foreach (XElement entityElement in entityElements)
    {
        var kind = GetType(entityElement.GetAttributeValue("type"));
        if (kind == NerType.Undefined)
        {
            // Entities of an unmapped type are skipped.
            continue;
        }

        // NOTE(review): ParseInt(-1) presumably returns -1 for unparseable
        // input - confirm; the values are used without validation.
        var begin = entityElement.GetAttributeValue("offset").ParseInt(-1);
        var length = entityElement.GetAttributeValue("length").ParseInt(-1);

        corpus.Entities.Add(new NerTextEntity(begin, length + begin, kind));
    }

    File.WriteAllText("modelsall_rco_text.txt", corpus.ClearedText);
    File.WriteAllText("modelsall_rco.txt", corpus.Render());

    // Wait for a line of console input before exiting.
    Console.ReadLine();
}
/// <summary>
/// Converts an ABBYY RDF/XML export ("rdf_IE_2.0_ru.xml") into a corpus: writes
/// the annotated text to "testmodeleng_text_abby.txt", prints the local names of
/// all root children in the BasicEntity namespace, and writes the rendered
/// corpus to "modelsall_abby.txt".
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    var model = new EntitiesCorpus();
    XDocument rdf = XDocument.Parse(File.ReadAllText(@"rdf_IE_2.0_ru.xml"));
    XElement root = rdf.Root;

    XNamespace rdfNs = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
    XNamespace auxNs = "http://www.abbyy.com/ns/Aux#";
    XNamespace basicEntityNs = "http://www.abbyy.com/ns/BasicEntity#";

    XName nodeIdName = rdfNs + "nodeID";
    XName resourceName = rdfNs + "resource";
    XName textAnnotationsName = auxNs + "TextAnnotations";
    XName instanceAnnotationName = auxNs + "InstanceAnnotation";
    XName startName = auxNs + "annotation_start";
    XName endName = auxNs + "annotation_end";
    XName instanceName = auxNs + "instance";
    XName documentTextName = auxNs + "document_text";

    XElement annotations = root.Element(textAnnotationsName);
    string text = annotations.Element(documentTextName).Value;
    File.WriteAllText("testmodeleng_text_abby.txt", text);

    // nodeID -> span of the first non-empty annotation seen for that node.
    var spansByNodeId = new Dictionary<string, Tuple<int, int>>();
    // span -> entity, shared by both AddEntityToModel passes below.
    var allEntities = new Dictionary<KeyValuePair<int, int>, NerTextEntity>();

    // Print the distinct element names found in the BasicEntity namespace.
    var basicEntityTypes = new HashSet<string>(
        root.Elements()
            .Where(child => child.Name.Namespace == basicEntityNs)
            .Select(child => child.Name.LocalName));
    foreach (string typeName in basicEntityTypes)
    {
        Console.WriteLine(typeName);
    }

    // Pass 1: walk the annotations. Those pointing at a nodeID are remembered
    // for pass 2; those pointing at a resource URI are typed right away.
    foreach (XElement wrapper in annotations.Elements(auxNs + "instance_annotation"))
    {
        XElement ann = wrapper.Element(instanceAnnotationName);
        if (ann == null)
        {
            continue;
        }

        XElement instanceElement = ann.Element(instanceName);
        XAttribute nodeIdAttr = instanceElement.Attribute(nodeIdName);
        int begin = int.Parse(ann.Element(startName).Value);
        int endi = int.Parse(ann.Element(endName).Value);

        if (nodeIdAttr != null)
        {
            string id = nodeIdAttr.Value;
            // Only the first annotation per node counts, and empty spans are ignored.
            if (!spansByNodeId.ContainsKey(id) && begin != endi)
            {
                spansByNodeId.Add(id, new Tuple<int, int>(begin, endi));
            }

            continue;
        }

        XAttribute resourceAttr = instanceElement.Attribute(resourceName);
        if (resourceAttr == null)
        {
            continue;
        }

        NerType resourceType = GetTypeFromUriAttr(resourceAttr.Value, string.Empty);
        if (resourceType != NerType.Undefined)
        {
            AddEntityToModel(model, begin, endi, resourceType, allEntities);
        }
    }

    model.ClearedText = text;

    // Pass 2: type the remembered nodeID spans from the root-level elements.
    foreach (XElement element in root.Elements())
    {
        NerType elementType = GetTypeFromUriAttr(element.Name.Namespace.NamespaceName, element.Name.LocalName);
        if (elementType == NerType.Undefined)
        {
            continue;
        }

        XAttribute idAttr = element.Attribute(nodeIdName);
        if (idAttr == null)
        {
            continue;
        }

        Tuple<int, int> span;
        if (spansByNodeId.TryGetValue(idAttr.Value, out span))
        {
            AddEntityToModel(model, span.Item1, span.Item2, elementType, allEntities);
        }
    }

    File.WriteAllText("modelsall_abby.txt", model.Render());

    // Wait for a line of console input before exiting.
    Console.ReadLine();
}
/// <summary>
/// Reads the ABBYY RDF/XML file "rdf_IE_2.0_ru.xml", saves its document text to
/// "testmodeleng_text_abby.txt", lists the BasicEntity element names on the
/// console, and saves the rendered entity corpus to "modelsall_abby.txt".
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    var model = new EntitiesCorpus();
    XElement root = XDocument.Parse(File.ReadAllText(@"rdf_IE_2.0_ru.xml")).Root;

    XNamespace rdfSyntax = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
    XNamespace aux = "http://www.abbyy.com/ns/Aux#";
    XNamespace basicEntity = "http://www.abbyy.com/ns/BasicEntity#";

    XName nodeIdAttrName = rdfSyntax + "nodeID";
    XName resourceAttrName = rdfSyntax + "resource";
    XName annotationsName = aux + "TextAnnotations";
    XName annotationName = aux + "InstanceAnnotation";
    XName annotationStartName = aux + "annotation_start";
    XName annotationEndName = aux + "annotation_end";
    XName instanceElementName = aux + "instance";
    XName documentTextName = aux + "document_text";

    XElement annotations = root.Element(annotationsName);
    string text = annotations.Element(documentTextName).Value;
    File.WriteAllText("testmodeleng_text_abby.txt", text);

    // Spans of node-based annotations, keyed by rdf:nodeID.
    var objects = new Dictionary<string, Tuple<int, int>>();
    // Entities added so far, keyed by span.
    var allEntities = new Dictionary<KeyValuePair<int, int>, NerTextEntity>();

    // Distinct local names of root children in the BasicEntity namespace.
    var basicEntityNames = new HashSet<string>(
        from child in root.Elements()
        where child.Name.Namespace == basicEntity
        select child.Name.LocalName);
    foreach (string name in basicEntityNames)
    {
        Console.WriteLine(name);
    }

    // First pass over the annotations: node-based ones are stored for the
    // second pass, resource-based ones are converted immediately.
    foreach (XElement holder in annotations.Elements(aux + "instance_annotation"))
    {
        XElement ann = holder.Element(annotationName);
        if (ann == null)
        {
            continue;
        }

        XElement target = ann.Element(instanceElementName);
        XAttribute nodeAttr = target.Attribute(nodeIdAttrName);
        int begin = int.Parse(ann.Element(annotationStartName).Value);
        int endi = int.Parse(ann.Element(annotationEndName).Value);

        if (nodeAttr == null)
        {
            XAttribute resAttr = target.Attribute(resourceAttrName);
            if (resAttr != null)
            {
                NerType resType = GetTypeFromUriAttr(resAttr.Value, string.Empty);
                if (resType != NerType.Undefined)
                {
                    AddEntityToModel(model, begin, endi, resType, allEntities);
                }
            }
        }
        else
        {
            string nodeKey = nodeAttr.Value;
            // Keep only the first span per node and skip empty spans.
            bool isNewNode = !objects.ContainsKey(nodeKey);
            if (isNewNode && begin != endi)
            {
                objects.Add(nodeKey, Tuple.Create(begin, endi));
            }
        }
    }

    model.ClearedText = text;

    // Second pass: root elements with a defined type and a stored nodeID span
    // become entities.
    foreach (XElement candidate in root.Elements())
    {
        XName candidateName = candidate.Name;
        NerType candidateType = GetTypeFromUriAttr(candidateName.Namespace.NamespaceName, candidateName.LocalName);
        if (candidateType == NerType.Undefined)
        {
            continue;
        }

        XAttribute candidateId = candidate.Attribute(nodeIdAttrName);
        if (candidateId == null)
        {
            continue;
        }

        Tuple<int, int> pos;
        if (objects.TryGetValue(candidateId.Value, out pos))
        {
            AddEntityToModel(model, pos.Item1, pos.Item2, candidateType, allEntities);
        }
    }

    File.WriteAllText("modelsall_abby.txt", model.Render());

    // Wait for a line of console input before exiting.
    Console.ReadLine();
}