Exemple #1
0
        /// <summary>
        /// Processes the HTML file at <paramref name="path"/>: runs denoising and word
        /// segmentation, updates the shared word-frequency table, stores a page record,
        /// then reads the <c>title</c> and <c>img</c> nodes of the document to refresh the
        /// title and collect tags, and finally stores the tags and their previews.
        /// All results are published through static members of <c>MainWindow</c>.
        /// </summary>
        /// <param name="path">Path of the HTML file to process.</param>
        /// <param name="link">Not read by this method; the stored link is taken from <c>MainWindow.curUrl</c>.</param>
        public static void msdn_channel9process(string path, string link)
        {
            Denoising de = new Denoising();

            de.Path = path;
            de.Work();                                    // after Work() the raw and denoised data are available
            MainWindow._rawdata       = de.Rawdata;       // raw data
            MainWindow._denoisingdata = de.Denoisingdata; // denoised data
            if (MainWindow._denoisingdata == null)        // null denoised data hints at an encoding (or similar) problem
            {
                MainWindow._denoisingdata = "";
            }

            // Segment the denoised text into words.
            WordSegment ws = new WordSegment();

            MainWindow._wordSegmentResult = ws.cutwords(MainWindow._denoisingdata, WordSegmentData.preWordSegmentor);

            // Accumulate per-word occurrence counts in the shared frequency table.
            foreach (string word in MainWindow._wordSegmentResult)
            {
                if (MainWindow.wordFreq.ContainsKey(word))
                {
                    int freq = (int)MainWindow.wordFreq[word];
                    MainWindow.wordFreq[word] = freq + 1;
                }
                else
                {
                    MainWindow.wordFreq.Add(word, 1);
                }
            }

            // Reset the per-page tag containers before mining.
            MainWindow._tags      = new List <string>();
            MainWindow.tagPreview = new Hashtable();

            // Run data mining on the raw and denoised data.
            DataMining dm = new DataMining();

            dm.Rawdata       = MainWindow._rawdata;
            dm.Denoisingdata = MainWindow._denoisingdata;
            dm.Work();

            // Title, author and post date as mined (the original author notes these are not very accurate).
            MainWindow._title    = dm.Title;
            MainWindow._author   = dm.Author;
            MainWindow._postdate = dm.Postdate;

            // Store the page record under the next page id.
            Page     pg     = new Page();
            PageItem pgitem = new PageItem();

            MainWindow.curwid = MainWindow.getWebpageNo() + 1;
            pgitem.setwid(MainWindow.curwid);
            pgitem.settitle(MainWindow._title);
            pgitem.setauthor(MainWindow._author);
            pgitem.setpostdate(MainWindow._postdate);
            pgitem.setlink(MainWindow.curUrl);
            pgitem.setreferred("");
            pg.savePage(pgitem);

            HtmlDocument doc = new HtmlDocument();

            try
            {
                doc.Load(path, Encoding.UTF8);
            }
            catch (Exception e)
            {
                // Best effort: the page record is already saved; without a parsable
                // document there is nothing to tag. Leave a trace instead of failing silently.
                System.Diagnostics.Debug.WriteLine("msdn_channel9process: failed to load '" + path + "': " + e.Message);
                return;
            }
            HtmlNode           rootNode          = doc.DocumentNode;
            string             CategoryListXPath = "//title|//img";
            HtmlNodeCollection categoryNodeList  = rootNode.SelectNodes(CategoryListXPath);

            // SelectNodes yields null (not an empty collection) when nothing matches,
            // so guard the loop to avoid a NullReferenceException on such pages.
            if (categoryNodeList != null)
            {
                foreach (HtmlNode child in categoryNodeList)
                {
                    // Re-parse the node on its own so the XPath queries below see only this element.
                    HtmlNode hn        = HtmlNode.CreateNode(child.OuterHtml);
                    HtmlNode titleNode = hn.SelectSingleNode("//title");
                    if (titleNode != null)
                    {
                        // Note: this updates the in-memory title only; the page record was saved above.
                        MainWindow._title = titleNode.InnerText;
                    }
                    else
                    {
                        HtmlNode thumbNode = hn.SelectSingleNode("//*[@class=\"thumb\"]");
                        if (thumbNode != null)
                        {
                            // The alt text of a "thumb" element is used as a tag.
                            // NOTE(review): the match includes the closing quote of the alt
                            // value, and is empty when there is no alt attribute - confirm
                            // whether stored tag names are meant to look like that.
                            string str       = thumbNode.OuterHtml;
                            string reg       = "(?<=alt=\")[^\"]*\"";
                            string key_words = Regex.Match(str, reg).Value;
                            MainWindow._tags.Add(key_words);
                        }
                    }
                }
            }

            // Save tags that have not been seen before, each under the next tag id.
            foreach (string tag in MainWindow._tags)
            {
                if (!MainWindow._occurredTags.Contains(tag))
                {
                    Tag     thistag = new Tag();
                    TagItem tgitem  = new TagItem();
                    tgitem.settid(MainWindow.getTagNo() + 1);
                    MainWindow._occurredTags.Add(tag);
                    tgitem.setname(tag);
                    thistag.saveTag(tgitem);
                }
            }

            // Save one preview row per tag, linking the tag id to the current page id.
            foreach (string tag in MainWindow._tags)
            {
                Preview     b     = new Preview();
                PreviewItem item3 = new PreviewItem();
                MainWindow.curtid = MainWindow.getTagNo(tag);
                item3.settid(MainWindow.curtid);
                item3.setwid(MainWindow.curwid);
                item3.setpreview((string)MainWindow.tagPreview[tag]);
                b.savePreview(item3);
            }
        }
Exemple #2
0
        /// <summary>
        /// Runs the processing pipeline on one file: denoising, word segmentation with
        /// word-frequency counting, and data mining for title, author and post date.
        /// Results are published through static members of <c>MainWindow</c>. When
        /// requested, the page, its tags and the tag previews are also persisted and
        /// the page is passed to <c>MainWindow.insertintosolr</c>.
        /// </summary>
        /// <param name="path">Path of the file to process.</param>
        /// <param name="needWriteToDB">Whether the extracted information should be written to the database.</param>
        public static void Process(string path, bool needWriteToDB)
        {
            // Step 1: denoise. After Work() both the raw and the denoised data are available.
            Denoising denoiser = new Denoising();

            denoiser.Path = path;
            denoiser.Work();
            MainWindow._rawdata = denoiser.Rawdata;
            // Null denoised data hints at an encoding (or similar) problem; fall back to empty text.
            MainWindow._denoisingdata = denoiser.Denoisingdata ?? "";

            // Step 2: segment the denoised text and count each word.
            // (An earlier variant that chose between a segmentation algorithm for Chinese
            // pages and separator splitting for English pages was disabled in favour of cutwords.)
            WordSegment segmenter = new WordSegment();

            MainWindow._wordSegmentResult = segmenter.cutwords(MainWindow._denoisingdata, WordSegmentData.preWordSegmentor);
            foreach (string token in MainWindow._wordSegmentResult)
            {
                if (!MainWindow.wordFreq.ContainsKey(token))
                {
                    MainWindow.wordFreq.Add(token, 1);
                    continue;
                }

                int seenSoFar = (int)MainWindow.wordFreq[token];
                MainWindow.wordFreq[token] = seenSoFar + 1;
            }

            // Step 3: data mining. Start from empty tag containers.
            MainWindow._tags      = new List <string>();
            MainWindow.tagPreview = new Hashtable();

            DataMining miner = new DataMining();

            miner.Rawdata       = MainWindow._rawdata;
            miner.Denoisingdata = MainWindow._denoisingdata;
            miner.Work();

            // Title, author and post date as mined (the original author notes these are not very accurate).
            MainWindow._title    = miner.Title;
            MainWindow._author   = miner.Author;
            MainWindow._postdate = miner.Postdate;

            // Step 4 (optional): persist to the page, tag and preview tables.
            if (!needWriteToDB)
            {
                return;
            }

            // Page record, stored under the next page id.
            Page     pageStore  = new Page();
            PageItem pageRecord = new PageItem();
            MainWindow.curwid = MainWindow.getWebpageNo() + 1;
            pageRecord.setwid(MainWindow.curwid);
            pageRecord.settitle(MainWindow._title);
            pageRecord.setauthor(MainWindow._author);
            pageRecord.setpostdate(MainWindow._postdate);
            pageRecord.setlink(MainWindow.curUrl);
            pageRecord.setreferred("");
            pageStore.savePage(pageRecord);

            string pageBody    = dealhtml(MainWindow._rawdata);
            string keywordText = "";

            // Tag records: only tags not seen before are stored, each under the next tag id.
            // Every tag, new or not, is appended (without separator) to the keyword text.
            foreach (string tagName in MainWindow._tags)
            {
                bool alreadyKnown = MainWindow._occurredTags.Contains(tagName);
                if (!alreadyKnown)
                {
                    Tag     tagStore  = new Tag();
                    TagItem tagRecord = new TagItem();
                    tagRecord.settid(MainWindow.getTagNo() + 1);
                    MainWindow._occurredTags.Add(tagName);
                    tagRecord.setname(tagName);
                    tagStore.saveTag(tagRecord);
                }

                keywordText = keywordText + tagName;
            }

            MainWindow.insertintosolr(MainWindow._title, "", MainWindow.curUrl, MainWindow._postdate, keywordText, 0, pageBody);

            // Preview records: one per tag, linking the tag id to the current page id.
            foreach (string tagName in MainWindow._tags)
            {
                Preview     previewStore  = new Preview();
                PreviewItem previewRecord = new PreviewItem();
                MainWindow.curtid = MainWindow.getTagNo(tagName);
                previewRecord.settid(MainWindow.curtid);
                previewRecord.setwid(MainWindow.curwid);
                previewRecord.setpreview((string)MainWindow.tagPreview[tagName]);
                previewStore.savePreview(previewRecord);
            }
        }