Ejemplo n.º 1
0
        /// <summary>
        /// Lazily reads statements from a text file using <c>Token.NextToken</c> and yields
        /// one triplet per well-formed statement.
        /// </summary>
        /// <remarks>
        /// Every statement is read as four consecutive tokens. The accepted shapes are
        /// Entity Entity Entity Point (yielded as an <c>OProp</c>) and
        /// Entity Entity Literal Point (yielded as a <c>DProp</c>, carrying the literal's
        /// <c>lang</c> when it is not null). Any other shape is reported on the console and skipped.
        /// The method overwrites the static <c>Token.loaded</c> buffer before each statement.
        /// The underlying reader is disposed when enumeration completes or when the
        /// enumerator is disposed (for example by leaving a <c>foreach</c> early).
        /// </remarks>
        /// <param name="fileName">Path of the file to read.</param>
        /// <param name="limit">Maximum number of statements (valid or not) to process.</param>
        /// <returns>A deferred sequence of the triplets recognised in the file.</returns>
        public static IEnumerable<Triplet> GenerateTripletFlowNt(string fileName, long limit)
        {
            // The using statement guarantees the file handle is released; previously the
            // reader was never closed.
            using (StreamReader sr = new StreamReader(fileName))
            {
                // Fresh buffer for the first statement; its contents are printed in the error report below.
                // NOTE(review): presumably Token.NextToken appends the raw text it consumes here - confirm.
                Token.loaded = new StringBuilder();
                Token tok1 = Token.NextToken(sr);
                long cnt = 0;
                while (tok1.vid != TokenEnumeration.Eos)
                {
                    if (cnt >= limit) break;

                    Token tok2 = Token.NextToken(sr);
                    Token tok3 = Token.NextToken(sr);
                    Token tok4 = Token.NextToken(sr);

                    // Both accepted shapes share this frame; only the third token differs.
                    bool wellFramed = tok1.vid == TokenEnumeration.Entity &&
                        tok2.vid == TokenEnumeration.Entity &&
                        tok4.vid == TokenEnumeration.Point;

                    if (wellFramed && tok3.vid == TokenEnumeration.Entity)
                    {
                        yield return new sema2012m.OProp(tok1.Value, tok2.Value, tok3.Value);
                    }
                    else if (wellFramed && tok3.vid == TokenEnumeration.Literal)
                    {
                        var dprop = new sema2012m.DProp(tok1.Value, tok2.Value, tok3.Value);
                        if (tok3.lang != null) dprop.lang = tok3.lang;
                        yield return dprop;
                    }
                    else
                    {
                        Console.WriteLine("Err in triplet {0} {1} {2} {3}\n [{4}]",
                            tok1.Value, tok2.Value, tok3.Value, tok4.Value, Token.loaded.ToString());
                    }

                    // The counter includes rejected statements, so limit bounds statements read, not triplets yielded.
                    cnt++;
                    if (cnt % 10000 == 0) Console.WriteLine(cnt);

                    // Start a new buffer for the next statement.
                    Token.loaded = new StringBuilder();
                    tok1 = Token.NextToken(sr);
                }
            }
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Reads a tab-separated file line by line and pushes one triplet per recognised row
 /// into the engine's adapter buffers.
 /// </summary>
 /// <remarks>
 /// Each row must have at least four tab-separated columns; columns 1, 2 and 3 (zero-based)
 /// are taken as subject, predicate and object, and column 0 is ignored. An object starting
 /// with '&lt;' produces an <c>OProp</c>; an object starting with '"' produces a <c>DProp</c>,
 /// with an optional language tag taken from the text after a trailing <c>"@</c>.
 /// Rows that are too short, have malformed fields, or have any other kind of object are skipped.
 /// The static <c>Ntriplets</c> counter is reset to zero and incremented for every buffered triplet.
 /// NOTE(review): the limit test uses '&gt;', so up to <paramref name="limit"/> + 1 triplets may be
 /// buffered; kept as is in case callers rely on it.
 /// </remarks>
 /// <param name="fileName">Path of the TSV file to read.</param>
 /// <param name="limit">Threshold on <c>Ntriplets</c> after which reading stops.</param>
 /// <param name="engine">Engine whose adapter buffers are initialised, filled and flushed.</param>
 /// <param name="turlog">Callback that receives a progress message whenever <c>Ntriplets</c> is a multiple of 200000.</param>
 public static void GenerateTripletFlowTsv(string fileName, long limit, sema2012m.Engine engine, sema2012m.LogLine turlog)
 {
     engine.InitAdapterBuffers();
     // The using statement releases the file handle even if a row or the engine throws;
     // previously the reader was never closed.
     using (StreamReader sr = new StreamReader(fileName))
     {
         Ntriplets = 0;
         string line;
         while ((line = sr.ReadLine()) != null)
         {
             if (Ntriplets > limit) break;
             if (Ntriplets % 200000 == 0) turlog("Загружено: " + Ntriplets + " триплетов.");

             string[] parts = line.Split('\t');
             if (parts.Length < 4) { continue; }
             string subj_ = parts[1];
             string pred_ = parts[2];
             string obj_ = parts[3];

             // Skip rows whose fields are too short to strip or inspect; these used to throw
             // from Substring or the indexer and abort the whole load.
             if (subj_.Length < 2 || pred_.Length == 0 || obj_.Length == 0) { continue; }
             if (pred_[0] == '<' && pred_.Length < 2) { continue; }

             // The first and last characters of the subject are dropped (the enclosing brackets).
             string subj = ConvertEntityId(subj_.Substring(1, subj_.Length - 2));
             // The predicate is unwrapped only when it starts with '<'; otherwise it is used verbatim.
             string pred = pred_[0] == '<' ? pred_.Substring(1, pred_.Length - 2) : pred_;
             // TODO: constructs such as rdfs:label should be converted to their normal (expanded) form.

             // The first character of the object decides what kind of statement this is.
             char c = obj_[0];
             if (c == '<') // object property
             {
                 if (obj_.Length < 2) { continue; }
                 Ntriplets++;
                 engine.AddTripletToBuffer(new sema2012m.OProp(subj, pred,
                     ConvertEntityId(obj_.Substring(1, obj_.Length - 2))));
             }
             else if (c == '\"') // data property (literal)
             {
                 int pos = obj_.LastIndexOf('\"');
                 // pos == 0 means the only quote is the opening one: an unterminated literal.
                 if (pos < 1) { continue; }
                 string lang = null;
                 // A language tag is accepted only when at least two characters follow the '@'.
                 if (pos < obj_.Length - 3 && obj_[pos + 1] == '@')
                 {
                     lang = obj_.Substring(pos + 2);
                 }
                 var dp = new sema2012m.DProp(subj, pred, obj_.Substring(1, pos - 1));
                 if (lang != null) dp.lang = lang;
                 Ntriplets++;
                 engine.AddTripletToBuffer(dp);
             }
             // Any other kind of object is unknown and is ignored.
         }
     }
     engine.FlushAdapterBuffers();
 }