Пример #1
0
        /// <summary>
        /// Helper: reads the user's details out of an HTML fragment and stores them on <c>user</c>.
        /// </summary>
        /// <remarks>
        /// The fragment is scanned once per field (nickname, remark name, link URL, self
        /// introduction, tags, profile attributes). For the nickname, remark name, link URL,
        /// self introduction and profile, markup that does not match the expected shape is
        /// reported on the console and the corresponding field is not assigned.
        /// </remarks>
        /// <param name="currentUserHtml">HTML text containing the Weibo user's information.</param>
        private void GetUserInfoFromHtml(string currentUserHtml)
        {
            // Filters that select each piece of information by its "class" attribute.
            HasAttributeFilter nickNameFilter = new HasAttributeFilter("class", "name");
            HasAttributeFilter remarkNameFilter = new HasAttributeFilter("class", "CH");
            HasAttributeFilter linkUrlFilter = new HasAttributeFilter("class", "pf_lin S_link1");
            HasAttributeFilter selfIntroFilter = new HasAttributeFilter("class", "pf_intro bsp");
            HasAttributeFilter tagsFilter = new HasAttributeFilter("class", "S_func1");
            HasAttributeFilter profileFilter = new HasAttributeFilter("class", "tags");

            Lexer lexer = new Lexer(currentUserHtml);
            Parser parser = new Parser(lexer);

            // Nickname: exactly one <span> is expected.
            NodeList nickNameNodeList = parser.ExtractAllNodesThatMatch(nickNameFilter);

            // The type test keeps an unexpected node from raising InvalidCastException.
            if (nickNameNodeList.Size() == 1 && nickNameNodeList[0] is Span)
            {
                user.NickName = ((Span)nickNameNodeList[0]).ToPlainTextString();
            }
            else
            {
                Console.WriteLine("判断微博名的标准出错!");
            }
            // Important: to reuse the parser, Reset() must be called after one extraction has
            // finished and before the next one starts; otherwise the next extraction fails.
            parser.Reset();

            // Remark name: exactly one <span> whose text is wrapped in brackets.
            NodeList remarkNameNodeList = parser.ExtractAllNodesThatMatch(remarkNameFilter);

            string remarkText = null;
            if (remarkNameNodeList.Size() == 1 && remarkNameNodeList[0].GetType().Equals(typeof(Span)))
            {
                remarkText = ((Span)remarkNameNodeList[0]).ToPlainTextString();
            }
            // At least two characters are required, otherwise there is no bracket pair to
            // strip and Substring would throw ArgumentOutOfRangeException.
            if (remarkText != null && remarkText.Length >= 2)
            {
                // Drop the leading and trailing bracket.
                user.RemarkName = remarkText.Substring(1, remarkText.Length - 2);
            }
            else
            {
                Console.WriteLine("判断微博备注名称的标准出错!");
            }
            parser.Reset();

            // Link URL: exactly one <a> is expected.
            NodeList linkUrlNodeList = parser.ExtractAllNodesThatMatch(linkUrlFilter);
            if (linkUrlNodeList.Size() == 1 && linkUrlNodeList[0].GetType().Equals(typeof(ATag)))
            {
                user.LinkURL = ((ATag)linkUrlNodeList[0]).StringText;
            }
            else
            {
                Console.WriteLine("判断微博链接地址的标准出错!");
            }
            parser.Reset();

            // Self introduction: the second child of the single matching node must be a <span>
            // whose TITLE attribute carries the text.
            NodeList selfIntroNodeList = parser.ExtractAllNodesThatMatch(selfIntroFilter);
            NodeList introChildren = selfIntroNodeList.Size() == 1 ? selfIntroNodeList[0].Children : null;
            // Guard against a missing or too-short child list before indexing element 1.
            if (introChildren != null
                && introChildren.Size() > 1
                && introChildren[1].GetType().Equals(typeof(Span)))
            {
                user.SelfIntroduction = ((Span)introChildren[1]).GetAttribute("TITLE");
            }
            else
            {
                Console.WriteLine("判断自我描述的标准出错!");
            }
            parser.Reset();

            // Tags: concatenate the text of every matching <span>, each followed by a space.
            NodeList tagsNodeList = parser.ExtractAllNodesThatMatch(tagsFilter);
            string tagText = "";
            for (int i = 0; i < tagsNodeList.Size(); i++)
            {
                if (tagsNodeList[i].GetType().Equals(typeof(Span)))
                {
                    tagText += ((Span)tagsNodeList[i]).ToPlainTextString() + " ";
                }
            }
            user.Tags = tagText;
            parser.Reset();

            // Profile attributes.
            NodeList profileNodeList = parser.ExtractAllNodesThatMatch(profileFilter);
            if (profileNodeList.Size() == 1)
            {
                // All useful information sits inside <a> tags, so collect those recursively and
                // then decide per tag whether the value is its text or a TITLE attribute.
                NodeClassFilter aTagFilter = new NodeClassFilter(typeof(ATag));
                NodeList profileChildren = profileNodeList[0].Children;
                // A container without children has nothing to extract.
                if (profileChildren != null)
                {
                    NodeList profileList = profileChildren.ExtractAllNodesThatMatch(aTagFilter, true);
                    for (int j = 0; j < profileList.Size(); j++)
                    {
                        ATag aTag = (ATag)profileList[j];
                        if (aTag.Attributes.Contains("TITLE"))
                        {
                            user.Profile += aTag.GetAttribute("TITLE") + " ";
                            continue;
                        }

                        // A node with node-type="infoSlide" marks the end of the attributes.
                        // The literal is the receiver so a null attribute value cannot throw.
                        if (aTag.Attributes.Contains("NODE-TYPE") && "infoSlide".Equals(aTag.GetAttribute("NODE-TYPE")))
                        {
                            break;
                        }

                        NodeList anchorChildren = aTag.Children;
                        // Exact TagNode match (not a subclass): the case of an <em> child node.
                        // The null/size guard covers anchors that have no children at all.
                        if (anchorChildren != null
                            && anchorChildren.Size() > 0
                            && anchorChildren[0].GetType().Equals(typeof(TagNode)))
                        {
                            TagNode tagNode = (TagNode)anchorChildren[0];
                            user.Profile += tagNode.GetAttribute("TITLE") + " ";
                        }
                        else
                        {
                            // Otherwise use the text contained in the <a> tag directly.
                            user.Profile += aTag.StringText + " ";
                        }
                    }
                }
            }
            else
            {
                Console.WriteLine("判断用户属性信息的标准出错!");
            }
        }
Пример #2
0
 /// <summary>
 /// Reads every *.htm file in the hard-coded subtitle directory and, for each file, extracts
 /// four groups of nodes (selected by "class" attribute) and hands each group to
 /// GetSubtitleFromHtml together with an output name derived from the file name.
 /// </summary>
 /// <remarks>
 /// The filter set applied to a file is chosen by the file's position in the directory
 /// listing, and only eight filter sets are defined. Processing stops with a console message
 /// once the sets are exhausted.
 /// NOTE(review): the listing order of DirectoryInfo.GetFiles is not guaranteed by the
 /// framework; confirm the files enumerate in the order the filter sets assume.
 /// </remarks>
 static void GetSubtitleHtmlFromFile()
 {
     const int filterSetCount = 8;

     // One list of four filters per input file: [0] red, [1] yellow, [2] blue, [3] green.
     List<List<NodeFilter>> filters = new List<List<NodeFilter>>();
     for (int i = 0; i < filterSetCount; i++)
     {
         filters.Add(new List<NodeFilter>());
     }
     filters[0].Add(new HasAttributeFilter("class", "xl29"));
     filters[0].Add(new HasAttributeFilter("class", "xl31"));
     filters[0].Add(new HasAttributeFilter("class", "xl32"));
     filters[0].Add(new HasAttributeFilter("class", "xl33"));
     filters[1].Add(new HasAttributeFilter("class", "xl25"));
     filters[1].Add(new HasAttributeFilter("class", "xl26"));
     filters[1].Add(new HasAttributeFilter("class", "xl27"));
     filters[1].Add(new HasAttributeFilter("class", "xl28"));
     filters[2].Add(new HasAttributeFilter("class", "xl27"));
     filters[2].Add(new HasAttributeFilter("class", "xl28"));
     filters[2].Add(new HasAttributeFilter("class", "xl29"));
     filters[2].Add(new HasAttributeFilter("class", "xl30"));
     filters[3].Add(new HasAttributeFilter("class", "xl27"));
     filters[3].Add(new HasAttributeFilter("class", "xl28"));
     filters[3].Add(new HasAttributeFilter("class", "xl29"));
     // In this file the fourth group appears under either of two classes.
     filters[3].Add(new OrFilter(new HasAttributeFilter("class", "xl31"), new HasAttributeFilter("class", "xl30")));
     filters[4].Add(new HasAttributeFilter("class", "xl27"));
     filters[4].Add(new HasAttributeFilter("class", "xl28"));
     filters[4].Add(new HasAttributeFilter("class", "xl29"));
     filters[4].Add(new HasAttributeFilter("class", "xl30"));
     filters[5].Add(new HasAttributeFilter("class", "xl33"));
     filters[5].Add(new HasAttributeFilter("class", "xl32"));
     filters[5].Add(new HasAttributeFilter("class", "xl30"));
     filters[5].Add(new HasAttributeFilter("class", "xl28"));
     filters[6].Add(new HasAttributeFilter("class", "xl29"));
     filters[6].Add(new HasAttributeFilter("class", "xl30"));
     filters[6].Add(new HasAttributeFilter("class", "xl31"));
     filters[6].Add(new HasAttributeFilter("class", "xl32"));
     filters[7].Add(new HasAttributeFilter("class", "xl28"));
     filters[7].Add(new HasAttributeFilter("class", "xl24"));
     filters[7].Add(new HasAttributeFilter("class", "xl30"));
     filters[7].Add(new HasAttributeFilter("class", "xl31"));

     DirectoryInfo directory = new DirectoryInfo(@"D:\Download\魔戒三部曲电影导演评论字幕\mht");
     int count = 0;
     foreach (FileInfo file in directory.GetFiles("*.htm"))
     {
         // Without this guard a ninth file would index past the end of the filter table.
         if (count >= filters.Count)
         {
             System.Console.WriteLine("No filter set defined for file #" + (count + 1) + " (" + file.Name + "); remaining files skipped.");
             break;
         }

         // 'using' releases the file handle even if reading throws.
         string htmlContent;
         using (StreamReader reader = new StreamReader(file.FullName))
         {
             htmlContent = reader.ReadToEnd();
         }

         // Output base name: everything before the first '.' in the file name.
         string fileName = file.Name.Substring(0, file.Name.IndexOf('.'));
         List<NodeFilter> fileFilters = filters[count];

         Lexer lexer = new Lexer(htmlContent);
         Parser parser = new Parser(lexer);

         // Red text: actors' commentary.
         NodeList redNodeList = parser.ExtractAllNodesThatMatch(fileFilters[0]);
         GetSubtitleFromHtml(redNodeList, fileName + "_演员解说");

         // Yellow text: director and writers' commentary. The parser is reset before each reuse.
         parser.Reset();
         NodeList yelloNodeList = parser.ExtractAllNodesThatMatch(fileFilters[1]);
         GetSubtitleFromHtml(yelloNodeList, fileName + "_导演编剧解说");

         // Blue text: special-effects team commentary.
         parser.Reset();
         NodeList blueNodeList = parser.ExtractAllNodesThatMatch(fileFilters[2]);
         GetSubtitleFromHtml(blueNodeList, fileName + "_特技制作组解说");

         // Green text: behind-the-scenes production team commentary.
         parser.Reset();
         NodeList greenNodeList = parser.ExtractAllNodesThatMatch(fileFilters[3]);
         GetSubtitleFromHtml(greenNodeList, fileName + "_幕后制作团队解说");

         count++;
     }
 }