Example #1
0
 /// <summary>
 /// Fetches the documents referenced by <paramref name="sourceUrl"/> within the given time window,
 /// writes each newly retrieved document as one "label TAB text" line to the corpus file, and
 /// records its cache key in the cache file so it is skipped once present in <c>mCache</c>.
 /// </summary>
 /// <param name="sourceUrl">Source passed to <c>DataService.GetDocRefs</c> to list document references.</param>
 /// <param name="lbl">Label written in front of every corpus line and appended to the cache key; must not contain a tab.</param>
 /// <param name="corpusFileOut">Path of the corpus output file (written as UTF-8).</param>
 /// <param name="cacheFileOut">Path of the cache-key output file.</param>
 /// <param name="timeStart">Start of the time window, forwarded to <c>GetDocRefs</c>.</param>
 /// <param name="timeEnd">End of the time window, forwarded to <c>GetDocRefs</c>.</param>
 /// <remarks>
 /// Failures while retrieving or writing a single document are printed to the console and the loop
 /// continues with the next reference. Failures outside that per-document scope (opening the files,
 /// listing the references) propagate to the caller; the output files are closed in either case.
 /// </remarks>
 static void RetrieveDocuments(string sourceUrl, string lbl, string corpusFileOut, string cacheFileOut, string timeStart, string timeEnd)
 {
     // The label is used as a tab-separated field, so an embedded tab would corrupt both output files.
     Debug.Assert(!lbl.Contains("\t"));
     // 'using' guarantees both writers are flushed and closed even when an exception escapes
     // (e.g. GetDocRefs fails or the second file cannot be opened).
     // NOTE(review): the result of Utils.VerifyFileNameOpen is used as the 'append' flag - confirm its semantics.
     using (StreamWriter corpus = new StreamWriter(corpusFileOut, /*append=*/Utils.VerifyFileNameOpen(corpusFileOut), Encoding.UTF8))
     using (StreamWriter cache = new StreamWriter(cacheFileOut, /*append=*/Utils.VerifyFileNameOpen(cacheFileOut)))
     {
         DataService service = new DataService();
         Console.WriteLine("Retrieving document references ...");
         string[][] docRefs = service.GetDocRefs(sourceUrl, timeStart, timeEnd);
         int i = 0;
         foreach (string[] row in docRefs)
         {
             // Each reference row is read as: [0] time, [1] corpus id, [2] document id.
             string time = row[0];
             string corpusId = row[1];
             string docId = row[2];
             i++;
             string cacheKey = corpusId + "\t" + docId + "\t" + lbl;
             if (!mCache.Contains(cacheKey))
             {
                 Console.WriteLine("Retrieving document # {0} / {1} ...", i, docRefs.Length);
                 try
                 {
                     string txt = service.GetDoc(corpusId, docId, "txt", false/*ignored*/, /*changesOnly=*/false, time);
                     // A leading "*** " marks an error message from the service; it is a fixed marker,
                     // so compare ordinally rather than with culture-sensitive rules.
                     if (!txt.StartsWith("*** ", StringComparison.Ordinal))
                     {
                         // Tabs are replaced so the text cannot break the tab-separated corpus line.
                         txt = Utils.ToOneLine(txt, /*compact=*/true).Replace('\t', ' ');
                         corpus.WriteLine(lbl + "\t" + txt);
                         // Flush after every document so progress survives an aborted run.
                         corpus.Flush();
                         cache.WriteLine(cacheKey);
                         cache.Flush();
                         mCache.AddRange(GenerateCacheKeys(corpusId, docId, lbl));
                     }
                     else
                     {
                         Console.WriteLine(txt); // error message from the service
                     }
                 }
                 catch (Exception e)
                 {
                     // Best effort: report the failure and carry on with the remaining documents.
                     Console.WriteLine(e.Message);
                     Console.WriteLine(e.StackTrace);
                 }
             }
             else
             {
                 Console.WriteLine("*** Document found in cache.");
             }
         }
     }
 }