/// <summary>
/// Lazily reads a file of four-token statements (subject, predicate, object, terminating point)
/// and yields one triplet per well-formed statement.
/// </summary>
/// <param name="fileName">Path of the file to read.</param>
/// <param name="limit">
/// Maximum number of statements to process. Malformed statements count towards the limit as well.
/// </param>
/// <returns>
/// A deferred sequence: an <c>OProp</c> for entity/entity/entity statements and a <c>DProp</c>
/// (with <c>lang</c> set when the literal token carries one) for entity/entity/literal statements.
/// Statements of any other shape are reported on the console and skipped.
/// </returns>
/// <remarks>
/// The method replaces the static <c>Token.loaded</c> buffer before every statement, so two
/// enumerations must not run at the same time.
/// </remarks>
public static IEnumerable<Triplet> GenerateTripletFlowNt(string fileName, long limit)
{
    // The using statement closes the file when enumeration completes, when the caller stops
    // enumerating early (the enumerator's Dispose runs the pending finally), or on an exception.
    using (StreamReader sr = new StreamReader(fileName))
    {
        Token.loaded = new StringBuilder();
        Token tok1 = Token.NextToken(sr);
        long cnt = 0;
        while (tok1.vid != TokenEnumeration.Eos)
        {
            if (cnt >= limit) break;

            Token tok2 = Token.NextToken(sr);
            Token tok3 = Token.NextToken(sr);
            Token tok4 = Token.NextToken(sr);

            // Both accepted shapes start with two entities and end with a point;
            // only the kind of the third token differs.
            bool framed = tok1.vid == TokenEnumeration.Entity
                && tok2.vid == TokenEnumeration.Entity
                && tok4.vid == TokenEnumeration.Point;

            if (framed && tok3.vid == TokenEnumeration.Entity)
            {
                yield return new sema2012m.OProp(tok1.Value, tok2.Value, tok3.Value);
            }
            else if (framed && tok3.vid == TokenEnumeration.Literal)
            {
                var dprop = new sema2012m.DProp(tok1.Value, tok2.Value, tok3.Value);
                if (tok3.lang != null) dprop.lang = tok3.lang;
                yield return dprop;
            }
            else
            {
                // Token.loaded is printed as context for the bad statement
                // (presumably the raw text consumed since the last reset - confirm in Token).
                Console.WriteLine("Err in triplet {0} {1} {2} {3}\n [{4}]",
                    tok1.Value, tok2.Value, tok3.Value, tok4.Value, Token.loaded.ToString());
            }

            cnt++;
            if (cnt % 10000 == 0) Console.WriteLine(cnt);

            // Start a fresh buffer for the next statement.
            Token.loaded = new StringBuilder();
            tok1 = Token.NextToken(sr);
        }
    }
}
/// <summary>
/// Reads a tab-separated file and pushes one triplet per usable row into the engine's adapter buffers.
/// </summary>
/// <param name="fileName">Path of the TSV file to read.</param>
/// <param name="limit">Maximum number of triplets to load; reading stops once this many were added.</param>
/// <param name="engine">
/// Receives the triplets: <c>InitAdapterBuffers</c> is called first, <c>AddTripletToBuffer</c> for
/// every triplet, and <c>FlushAdapterBuffers</c> after the file has been read.
/// </param>
/// <param name="turlog">Progress callback, invoked each time the loaded count reaches a multiple of 200000 (including 0).</param>
/// <remarks>
/// Columns 1, 2 and 3 (zero-based) are used as subject, predicate and object; column 0 is ignored.
/// Rows with fewer than four columns, rows whose fields are too short to be unwrapped, and rows whose
/// object starts with neither '&lt;' nor '"' are skipped. The static <c>Ntriplets</c> counter is reset
/// at the start and holds the number of loaded triplets afterwards.
/// </remarks>
public static void GenerateTripletFlowTsv(string fileName, long limit, sema2012m.Engine engine, sema2012m.LogLine turlog)
{
    engine.InitAdapterBuffers();

    // The using statement guarantees the file is closed even if parsing or the engine throws.
    using (StreamReader sr = new StreamReader(fileName))
    {
        Ntriplets = 0;

        // True when the counter changed since the last progress message; prevents the same
        // message from being repeated for every skipped row while the counter stands still.
        bool reportProgress = true;

        string line;
        while ((line = sr.ReadLine()) != null)
        {
            // ">=" so that exactly `limit` triplets are loaded, not limit + 1.
            if (Ntriplets >= limit) break;

            if (reportProgress && Ntriplets % 200000 == 0)
            {
                turlog("Загружено: " + Ntriplets + " триплетов.");
                reportProgress = false;
            }

            string[] parts = line.Split('\t');
            if (parts.Length < 4) continue;

            string subj_ = parts[1];
            string pred_ = parts[2];
            string obj_ = parts[3];

            // The subject must have room for its two enclosing characters; predicate and
            // object are inspected by their first character, so they must not be empty.
            if (subj_.Length < 2 || pred_.Length == 0 || obj_.Length == 0) continue;
            // A bracketed predicate needs both the opening and the closing character.
            if (pred_[0] == '<' && pred_.Length < 2) continue;

            string subj = ConvertEntityId(subj_.Substring(1, subj_.Length - 2));
            // TODO: constructs such as rdfs:label should be converted to a normal form.
            string pred = pred_[0] == '<' ? pred_.Substring(1, pred_.Length - 2) : pred_;

            // Look at the object to determine the kind of statement.
            char c = obj_[0];
            if (c == '<') // object property
            {
                if (obj_.Length < 2) continue; // a lone '<' has nothing to unwrap

                string obj = ConvertEntityId(obj_.Substring(1, obj_.Length - 2));
                Ntriplets++;
                reportProgress = true;
                engine.AddTripletToBuffer(new sema2012m.OProp(subj, pred, obj));
            }
            else if (c == '\"') // data property
            {
                int pos = obj_.LastIndexOf('\"');
                if (pos < 1) continue; // only the opening quote is present

                // A language tag follows the closing quote as @xx and needs at least two characters.
                string lang = null;
                if (pos < obj_.Length - 3 && obj_[pos + 1] == '@')
                {
                    lang = obj_.Substring(pos + 2);
                }

                var dp = new sema2012m.DProp(subj, pred, obj_.Substring(1, pos - 1));
                if (lang != null) dp.lang = lang;
                Ntriplets++;
                reportProgress = true;
                engine.AddTripletToBuffer(dp);
            }
            // Any other object form is of unknown kind and is ignored.
        }
    }

    engine.FlushAdapterBuffers();
}