Пример #1
0
        /// <summary>
        /// Fills <paramref name="channelAttr"/> with the channel-level attributes of an RSS document
        /// plus a fixed set of bookkeeping entries describing this poll.
        /// </summary>
        /// <remarks>
        /// Only child elements of <c>channel</c> whose names are listed in <c>mChannelElements</c> are
        /// recorded; all other child elements are skipped via <c>Utils.XmlSkip</c>. When the same
        /// element name occurs more than once, the values are concatenated with <c>" ;; "</c>.
        /// A <c>pubDate</c> value is replaced by the result of <c>Utils.NormalizeDateTimeStr</c>
        /// whenever that result is not <c>null</c>.
        /// The bookkeeping entries are inserted with <c>Dictionary.Add</c>, so an
        /// <see cref="ArgumentException"/> is raised if one of those keys is already present.
        /// </remarks>
        /// <param name="url">Feed URL; stored under the key <c>sourceUrl</c>.</param>
        /// <param name="xml">RSS document text to parse; also stored under <c>source</c> when <c>mIncludeRssXml</c> is set.</param>
        /// <param name="timeStart">Stored under <c>timeStart</c>, formatted with <c>Utils.DATE_TIME_SIMPLE</c>.</param>
        /// <param name="channelAttr">Dictionary that receives the attributes (modified in place).</param>
        private void ReadChannelAttributes(string url, string xml, DateTime timeStart, Dictionary <string, string> channelAttr)
        {
            // The using block closes the reader even when parsing throws (e.g. on malformed XML);
            // previously the reader was only closed on the success path.
            using (XmlTextReader reader = new XmlTextReader(new StringReader(xml)))
            {
                mLogger.Info("ProduceData", "Reading channel attributes ...");
                while (reader.Read())
                {
                    if (reader.NodeType == XmlNodeType.Element && reader.Name == "channel" && !reader.IsEmptyElement)
                    {
                        // walk the children of <channel> until its end tag (or end of input)
                        while (reader.Read() && !(reader.NodeType == XmlNodeType.EndElement && reader.Name == "channel"))
                        {
                            if (reader.NodeType != XmlNodeType.Element)
                            {
                                continue;
                            }
                            if (!mChannelElements.Contains(reader.Name))
                            {
                                // not a channel attribute we track
                                Utils.XmlSkip(reader, reader.Name);
                                continue;
                            }
                            string attrName = reader.Name;
                            string value    = Utils.XmlReadValue(reader, attrName);
                            if (attrName == "pubDate")
                            {
                                // keep the raw text when normalization yields null
                                string normalized = Utils.NormalizeDateTimeStr(value);
                                if (normalized != null)
                                {
                                    value = normalized;
                                }
                            }
                            string oldValue;
                            if (channelAttr.TryGetValue(attrName, out oldValue))
                            {
                                // repeated element: append to the value collected so far
                                channelAttr[attrName] = oldValue + " ;; " + value;
                            }
                            else
                            {
                                channelAttr.Add(attrName, value);
                            }
                        }
                    }
                }
            }
            // bookkeeping entries describing this poll
            channelAttr.Add("siteId", mSiteId);
            channelAttr.Add("provider", GetType().ToString());
            channelAttr.Add("sourceUrl", url);
            if (mIncludeRssXml)
            {
                channelAttr.Add("source", xml);
            }
            channelAttr.Add("timeBetweenPolls", TimeBetweenPolls.ToString());
            channelAttr.Add("timeStart", timeStart.ToString(Utils.DATE_TIME_SIMPLE));
        }
Пример #2
0
 /// <summary>
 /// Polls each URL in <c>mSources</c> once: downloads the RSS XML, passes the collected
 /// attributes of every <c>item</c> element to <c>ProcessItem</c>, and, when the first corpus
 /// contains documents afterwards, copies the channel-level attributes onto every corpus and
 /// hands each corpus to <c>DispatchData</c>.
 /// </summary>
 /// <remarks>
 /// The XML is parsed twice: a first pass over the <c>item</c> elements and, only when there
 /// are documents to dispatch, a second pass over the <c>channel</c> element. Repeated element
 /// names are concatenated with <c>" ;; "</c>; <c>pubDate</c> values are replaced by the result
 /// of <c>Utils.NormalizeDateTimeStr</c> when that result is not <c>null</c>. Item values that
 /// are empty after trimming are ignored.
 /// Exceptions raised while parsing or dispatching one source are logged and the loop moves
 /// on to the next source.
 /// </remarks>
 /// <returns>Always <c>null</c>; results leave the method through <c>DispatchData</c>.</returns>
 protected override object ProduceData()
 {
     for (int i = 0; i < mSources.Count; i++)
     {
         string url = mSources[i];
         try
         {
             DateTime timeStart = DateTime.Now;
             // get RSS XML
             string xml;
             try
             {
                 mLogger.Info("ProduceData", "Getting RSS XML from {0} ...", url);
                 xml = WebUtils.GetWebPageDetectEncoding(url);
                 xml = FixXml(xml);
             }
             catch (Exception e)
             {
                 mLogger.Error("ProduceData", e);
                 // NOTE(review): a download failure ends the whole polling round, whereas a
                 // parsing failure (outer catch) only skips this source -- confirm this is intended.
                 return(null);
             }
             Dictionary <string, string> channelAttr = new Dictionary <string, string>();
             ArrayList <DocumentCorpus>  corpora     = new ArrayList <DocumentCorpus>(new DocumentCorpus[] { new DocumentCorpus() });
             corpora.Last.Features.SetFeatureValue("guid", Guid.NewGuid().ToString());
             // first pass: items
             // (using closes the reader on every exit path, including the early return and exceptions)
             using (XmlTextReader reader = new XmlTextReader(new StringReader(xml)))
             {
                 mLogger.Info("ProduceData", "Reading items ...");
                 while (reader.Read())
                 {
                     if (reader.NodeType == XmlNodeType.Element && reader.Name == "item" && !reader.IsEmptyElement)
                     {
                         Dictionary <string, string> itemAttr = new Dictionary <string, string>();
                         // walk the children of <item> until its end tag (or end of input)
                         while (reader.Read() && !(reader.NodeType == XmlNodeType.EndElement && reader.Name == "item"))
                         {
                             if (reader.NodeType != XmlNodeType.Element)
                             {
                                 continue;
                             }
                             if (!mItemElements.Contains(reader.Name))
                             {
                                 // not an item attribute we track
                                 Utils.XmlSkip(reader, reader.Name);
                                 continue;
                             }
                             // handle item attributes
                             string attrName = reader.Name;
                             string value    = Utils.XmlReadValue(reader, attrName);
                             if (value.Trim() == "")
                             {
                                 // blank values are not recorded
                                 continue;
                             }
                             if (attrName == "pubDate")
                             {
                                 // keep the raw text when normalization yields null
                                 string normalized = Utils.NormalizeDateTimeStr(value);
                                 if (normalized != null)
                                 {
                                     value = normalized;
                                 }
                             }
                             string oldValue;
                             if (itemAttr.TryGetValue(attrName, out oldValue))
                             {
                                 // repeated element: append to the value collected so far
                                 itemAttr[attrName] = oldValue + " ;; " + value;
                             }
                             else
                             {
                                 itemAttr.Add(attrName, value);
                             }
                         }
                         // stopped?
                         if (mStopped)
                         {
                             if (corpora[0].Documents.Count == 0)
                             {
                                 return(null);
                             }
                             // something was already collected: stop reading items but still dispatch it
                             break;
                         }
                         // presumably adds new documents to corpora -- ProcessItem is defined elsewhere
                         ProcessItem(itemAttr, corpora, url);
                     }
                 }
             }
             if (corpora[0].Documents.Count > 0)
             {
                 // second pass: channel attributes
                 // (the reader is created only when this pass runs; previously it was also
                 // created, and never closed, when there were no new items)
                 using (XmlTextReader reader = new XmlTextReader(new StringReader(xml)))
                 {
                     mLogger.Info("ProduceData", "Reading channel attributes ...");
                     while (reader.Read())
                     {
                         if (reader.NodeType == XmlNodeType.Element && reader.Name == "channel" && !reader.IsEmptyElement)
                         {
                             // handle channel
                             while (reader.Read() && !(reader.NodeType == XmlNodeType.EndElement && reader.Name == "channel"))
                             {
                                 if (reader.NodeType != XmlNodeType.Element)
                                 {
                                     continue;
                                 }
                                 if (!mChannelElements.Contains(reader.Name))
                                 {
                                     // not a channel attribute we track
                                     Utils.XmlSkip(reader, reader.Name);
                                     continue;
                                 }
                                 // handle channel attributes
                                 string attrName = reader.Name;
                                 string value    = Utils.XmlReadValue(reader, attrName);
                                 if (attrName == "pubDate")
                                 {
                                     string normalized = Utils.NormalizeDateTimeStr(value);
                                     if (normalized != null)
                                     {
                                         value = normalized;
                                     }
                                 }
                                 string oldValue;
                                 if (channelAttr.TryGetValue(attrName, out oldValue))
                                 {
                                     channelAttr[attrName] = oldValue + " ;; " + value;
                                 }
                                 else
                                 {
                                     channelAttr.Add(attrName, value);
                                 }
                             }
                         }
                     }
                 }
                 // bookkeeping entries describing this poll; Add throws ArgumentException
                 // if a channel element of the same name was already recorded
                 channelAttr.Add("siteId", mSiteId);
                 channelAttr.Add("provider", GetType().ToString());
                 channelAttr.Add("sourceUrl", url);
                 if (mIncludeRssXml)
                 {
                     channelAttr.Add("source", xml);
                 }
                 channelAttr.Add("timeBetweenPolls", TimeBetweenPolls.ToString());
                 channelAttr.Add("timeStart", timeStart.ToString(Utils.DATE_TIME_SIMPLE));
                 channelAttr.Add("timeEnd", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
                 // copy the channel attributes onto every corpus and count the documents
                 int newItems = 0;
                 foreach (DocumentCorpus corpus in corpora)
                 {
                     newItems += corpus.Documents.Count;
                     foreach (KeyValuePair <string, string> attr in channelAttr)
                     {
                         corpus.Features.SetFeatureValue(attr.Key, attr.Value);
                     }
                 }
                 mLogger.Info("ProduceData", "{0} new items.", newItems);
                 // dispatch data
                 foreach (DocumentCorpus corpus in corpora)
                 {
                     DispatchData(corpus);
                 }
             }
             else
             {
                 mLogger.Info("ProduceData", "No new items.");
             }
             // stopped?
             if (mStopped)
             {
                 return(null);
             }
         }
         catch (Exception e)
         {
             // a failure in one source is logged and does not prevent polling the remaining sources
             mLogger.Error("ProduceData", e);
         }
     }
     return(null);
 }