Example #1
0
        /// <summary>
        /// Downloads the New York Times US RSS feed, stores every entry that is newer than the
        /// timestamp recorded for the "NYTimes" source, and records up to three keywords
        /// (the first tokens of title + description) for each stored article.
        /// </summary>
        /// <returns>The default view of this action.</returns>
        public ActionResult Index()
        {
            /******************************
            *  Fetch the RSS address and deserialize the XML payload
            ******************************/
            XmlSerializer serializer = new XmlSerializer(typeof(rss));
            rss newsItems;

            // NOTE(review): the feed is fetched over plain HTTP and parsed with the XmlTextReader
            // defaults; consider prohibiting DTD processing - confirm against the target framework.
            // The using block releases the underlying response stream even if deserialization throws.
            using (XmlReader reader = new XmlTextReader("http://rss.nytimes.com/services/xml/rss/nyt/US.xml"))
            {
                newsItems = (rss)serializer.Deserialize(reader);
            }

            // Nothing to import: avoids indexing item[-1] (or a null array) below.
            if (newsItems == null || newsItems.item == null || newsItems.item.Length == 0)
            {
                return(View());
            }

            // pubDate is machine-generated text, so it is parsed independently of the server culture.
            var culture = System.Globalization.CultureInfo.InvariantCulture;

            /******************************
            *  Initialize the "latest stored time" watermark (newTime)
            ******************************/
            DateTime      newTime;
            Models.source src = db.sources.Find("NYTimes");

            if (src == null)
            {
                // No source row yet: the last element of the feed becomes the watermark.
                // Entries at or before this time are skipped by the loop below.
                newTime = DateTime.Parse(newsItems.item[newsItems.item.Length - 1].pubDate, culture);
                src     = new Models.source("NYTimes", newTime);
                db.sources.Add(src);
            }
            else
            {
                newTime = src.newDate.Value;
            }

            /******************************
            *  Walk the feed from the last element to the first and add only new entries
            ******************************/
            const int keywordCount = 3;
            char[]    separators   = new Char[] { ',', '.', ' ', '?', ':', '\'', '‘', '’', '|' };

            for (var i = newsItems.item.Length - 1; i >= 0; i--)    // last feed element is stored first
            {
                DateTime time = DateTime.Parse(newsItems.item[i].pubDate, culture);

                // Compare against the watermark captured before the loop; older or equal entries are dropped.
                if (time <= newTime)
                {
                    continue;
                }

                src.newDate = time;    // advance the stored time of the source

                // The description can contain markup between '<' and '>' (e.g. ads); strip it.
                newsItems.item[i].description = Regex.Replace(newsItems.item[i].description, "<.*?>", string.Empty);

                string text = newsItems.item[i].title + " " + newsItems.item[i].description;
                text = text.ToLower();

                // non1..non20 are filters declared elsewhere in the class; each match is removed from the text.
                text = non1.Replace(text, "");
                text = non2.Replace(text, "");
                text = non3.Replace(text, "");
                text = non4.Replace(text, "");
                text = non5.Replace(text, "");
                text = non6.Replace(text, "");
                text = non7.Replace(text, "");
                text = non8.Replace(text, "");
                text = non9.Replace(text, "");
                text = non10.Replace(text, "");
                text = non11.Replace(text, "");
                text = non12.Replace(text, "");
                text = non13.Replace(text, "");
                text = non14.Replace(text, "");
                text = non15.Replace(text, "");
                text = non16.Replace(text, "");
                text = non17.Replace(text, "");
                text = non18.Replace(text, "");
                text = non19.Replace(text, "");
                text = non20.Replace(text, "");

                string[] words = text.Split(separators, StringSplitOptions.RemoveEmptyEntries);

                // Take the first three tokens as keywords. Articles with fewer tokens get empty
                // keywords instead of throwing IndexOutOfRangeException.
                string[] keywords = new string[keywordCount];
                for (int k = 0; k < keywordCount; k++)
                {
                    keywords[k] = k < words.Length
                                  ? Regex.Replace(words[k], "[\\s\\p{P}\n\r=<>$>+¥^]", "")
                                  : "";
                }

                Models.item item = new Models.item(newsItems.item[i], time, "NYTimes", 0, keywords[0], keywords[1], keywords[2]);

                db.items.Add(item);
                db.SaveChanges();      // save now: item.Id is used for the keyword rows below

                foreach (string keyword in keywords)
                {
                    // Empty keywords (missing token or token reduced to nothing) carry no information.
                    if (keyword.Length == 0)
                    {
                        continue;
                    }

                    db.articleKeyword.Add(new ArticleKeyword(keyword, item.Id));

                    // Single lookup: bump the running total or start it at 1.
                    var total = db.keywordsTotal.Find(keyword);
                    if (total != null)
                    {
                        total.keywordSum++;
                    }
                    else
                    {
                        db.keywordsTotal.Add(new KeywordsTotal(keyword, 1));
                    }
                }
            }
            db.SaveChanges();

            return(View());
        }
Example #2
0
        //
        // GET: /News/

        /// <summary>
        /// Downloads the New York Times US RSS feed, stores every entry newer than the timestamp
        /// recorded for the "NYTimes" source, runs the keyword analysis for each stored article and
        /// saves its three highest-TFIDF words on the article.
        /// </summary>
        /// <returns>A view over all stored items.</returns>
        public ActionResult Index()
        {
            /******************************
            *  Fetch the RSS address and deserialize the XML payload
            ******************************/
            XmlSerializer serializer = new XmlSerializer(typeof(rss));
            rss cars;

            // NOTE(review): the feed is fetched over plain HTTP and parsed with the XmlTextReader
            // defaults; consider prohibiting DTD processing - confirm against the target framework.
            // The using block releases the underlying response stream even if deserialization throws.
            using (XmlReader reader = new XmlTextReader("http://rss.nytimes.com/services/xml/rss/nyt/US.xml"))
            {
                cars = (rss)serializer.Deserialize(reader);
            }

            // Nothing to import: avoids indexing item[-1] (or a null array) below.
            if (cars == null || cars.item == null || cars.item.Length == 0)
            {
                return(View(db.items.ToList()));
            }

            // pubDate is machine-generated text, so it is parsed independently of the server culture.
            var      culture = System.Globalization.CultureInfo.InvariantCulture;
            DateTime cutoff  = DateTime.Parse("Sat, 07 Feb 2015 00:57:00 GMT", culture);

            /******************************
            *  Initialize the "latest stored time" watermark (newTime)
            ******************************/
            Models.source src = db.sources.Find("NYTimes");

            if (src == null)
            {
                // Initialize the source row with the fixed cutoff date.
                src = new Models.source("NYTimes", cutoff);
                db.sources.Add(src);
            }

            DateTime newTime;

            if (src.newDate.Value < cutoff)
            {
                // Stored time is older than the cutoff: restart from the last element of the feed.
                // The existing row is updated; adding a second "NYTimes" entity would duplicate its key.
                newTime     = DateTime.Parse(cars.item[cars.item.Length - 1].pubDate, culture);
                src.newDate = newTime;
            }
            else
            {
                newTime = src.newDate.Value;
            }

            /******************************
            *  Walk the feed from the last element to the first and add only new entries
            ******************************/
            for (var i = cars.item.Length - 1; i >= 0; i--)    // last feed element is stored first
            {
                DateTime time = DateTime.Parse(cars.item[i].pubDate, culture);

                // Compare against the watermark captured before the loop; older or equal entries are dropped.
                if (time <= newTime)
                {
                    continue;
                }

                src.newDate = time;    // advance the stored time of the source

                // The description can contain markup between '<' and '>' (e.g. ads); strip it.
                cars.item[i].description = Regex.Replace(cars.item[i].description, "<.*?>", string.Empty);

                Models.item item = new Models.item(cars.item[i], time, "NYTimes", 0);

                db.items.Add(item);
                db.SaveChanges();      // save before calling the analyzer: it is handed the stored item / its Id

                /**********************************
                * For every added article, update the keyword and artKey statistics
                * ********************************/
                KeywordAnalyzer ka = new KeywordAnalyzer();

                ka.analyze(item);
                ka.TFIDF(item.Id);

                /*****************************
                * Store the result on the article row: the three words with the highest TFIDF
                *****************************/
                var itemId   = item.Id;
                var topWords = db.artKeys
                               .Where(x => x.AId == itemId)
                               .OrderByDescending(x => x.TFIDF)
                               .Take(3);

                // Comma-terminated list, e.g. "a,b,c," (trailing comma kept as in the stored format).
                string str = "";
                foreach (var line in topWords)
                {
                    str += line.word + ",";
                }

                item.keyword = str;    // item is the entity tracked by db, no extra lookup needed
                db.SaveChanges();
            }

            return(View(db.items.ToList()));
        }
Example #3
0
        //
        // GET: /Xml2Model/

        /// <summary>
        /// Downloads the CNN US RSS feed and stores every entry that is newer than the timestamp
        /// recorded for the "CNN" source. Stored items get empty keyword fields.
        /// </summary>
        /// <returns>The default view of this action.</returns>
        public ActionResult Index()
        {
            /******************************
            *  Fetch the RSS address and deserialize the XML payload
            ******************************/
            XmlSerializer serializer = new XmlSerializer(typeof(rss));
            rss cars;

            // NOTE(review): the feed is fetched over plain HTTP and parsed with the XmlTextReader
            // defaults; consider prohibiting DTD processing - confirm against the target framework.
            // The using block releases the underlying response stream even if deserialization throws.
            using (XmlReader reader = new XmlTextReader("http://rss.cnn.com/rss/cnn_us.rss"))
            {
                cars = (rss)serializer.Deserialize(reader);
            }

            // Nothing to import: avoids indexing item[-1] (or a null array) below.
            if (cars == null || cars.item == null || cars.item.Length == 0)
            {
                return(View());
            }

            // Built once instead of once per feed entry.
            // NOTE(review): "EST" is matched literally, so a pubDate with any other zone suffix
            // throws FormatException; the one-hour shift is applied unconditionally - confirm intent.
            CultureInfo            feedCulture  = new CultureInfo("en-US");
            Func<string, DateTime> parsePubDate = raw =>
                DateTime.ParseExact(raw, "ddd, dd MMM yyyy HH:mm:ss EST", feedCulture).AddHours(-1);

            /******************************
            *  Initialize the "latest stored time" watermark (newTime)
            ******************************/
            DateTime      newTime;
            Models.source src = db.sources.Find("CNN");

            if (src == null)
            {
                // No source row yet: the last element of the feed becomes the watermark.
                // Entries at or before this time are skipped by the loop below.
                newTime = parsePubDate(cars.item[cars.item.Length - 1].pubDate);
                src     = new Models.source("CNN", newTime);
                db.sources.Add(src);
            }
            else
            {
                newTime = src.newDate.Value;
            }

            /******************************
            *  Walk the feed from the last element to the first and add only new entries
            ******************************/
            for (var i = cars.item.Length - 1; i >= 0; i--)    // last feed element is stored first
            {
                DateTime time = parsePubDate(cars.item[i].pubDate);

                // Compare against the watermark captured before the loop; older or equal entries are dropped.
                if (time <= newTime)
                {
                    continue;
                }

                src.newDate = time;    // advance the stored time of the source

                db.items.Add(new Models.item(cars.item[i], time, "CNN", 0, "", "", ""));
            }
            db.SaveChanges();

            return(View());
        }