Exemple #1
0
    /// <summary>
    /// Collects the rows, across all tables of the document, that contain at least one
    /// cell matching the content rule.
    /// </summary>
    /// <param name="root">Parsed document; its <c>TableList</c> is read with keys 1..Count.</param>
    /// <param name="rule">
    /// Keywords to look for (<c>Content</c>) and the match mode: when <c>IsContentEq</c> is set a
    /// cell must equal a keyword, otherwise the cell must contain a keyword. Spaces are removed
    /// from the cell text before comparing.
    /// </param>
    /// <returns>
    /// One converted row (see <c>ConvertRowToCellInfo</c>) per matching table row; a row is added
    /// at most once. Empty when the document has no tables or the rule has no content.
    /// </returns>
    public static List <List <CellInfo> > GetMultiRowsByContentRule(HTMLEngine.MyRootHtmlNode root, TableSearchContentRule rule)
    {
        var Container = new List <List <CellInfo> >();

        // A document without tables, or a rule without keywords, cannot produce a match.
        if (root.TableList == null || rule.Content == null)
        {
            return Container;
        }

        for (int tableNo = 1; tableNo <= root.TableList.Count; tableNo++)
        {
            var table     = new HTMLTable(root.TableList[tableNo]);
            var RowHeader = table.GetRow(1);

            // NOTE(review): the loop stops before table.RowCount, so the last row is never
            // examined. Confirm whether that is intended or an off-by-one.
            for (int RowNo = 1; RowNo < table.RowCount; RowNo++)
            {
                var row = table.GetRow(RowNo);
                foreach (var cell in row)
                {
                    var cellText = cell.Replace(" ", "");
                    var IsMatch  = false;
                    foreach (var content in rule.Content)
                    {
                        // Equal mode: the cell is exactly the keyword.
                        // Contain mode: the cell contains the keyword. Testing the keyword
                        // for the cell text instead would make every empty cell a match,
                        // because any string contains the empty string.
                        IsMatch = rule.IsContentEq
                            ? content.Equals(cellText)
                            : cellText.Contains(content);
                        if (IsMatch)
                        {
                            break;
                        }
                    }

                    if (IsMatch)
                    {
                        // One matching cell is enough; record the row once and move on.
                        Container.Add(ConvertRowToCellInfo(row, tableNo, RowNo, RowHeader));
                        break;
                    }
                }
            }
        }
        return Container;
    }
Exemple #2
0
 /// <summary>
 /// Records the titles (row 1) of the columns in which the keyword occurs.
 /// For every hit the title's counter in <c>TrainingTitleResult</c> is incremented and the
 /// comma-joined result of <c>GetRow()</c> is remembered once in <c>WholeHeaderRow</c>.
 /// </summary>
 /// <param name="root">Parsed document; nothing is done when its <c>TableList</c> is null.</param>
 /// <param name="KeyWord">Value to look for; compared with cell values after <c>NormalizeTextResult</c>.</param>
 public void PutTitleTrainingItem(HTMLEngine.MyRootHtmlNode root, string KeyWord)
 {
     if (root.TableList == null)
     {
         return;
     }

     foreach (var entry in root.TableList)
     {
         var table = new HTMLTable(entry.Value);

         // Row 1 supplies the titles, so the scan starts at the second row.
         for (int rowNo = 2; rowNo < table.RowCount; rowNo++)
         {
             for (int colNo = 1; colNo < table.ColumnCount; colNo++)
             {
                 var header = table.CellValue(1, colNo).Replace(" ", "");
                 if (String.IsNullOrEmpty(header))
                 {
                     // A column without a title cannot be tallied.
                     continue;
                 }

                 var cellText = table.CellValue(rowNo, colNo);
                 if (Transform != null)
                 {
                     // Optional hook that rewrites the cell value using its title.
                     cellText = Transform(cellText, header);
                 }

                 var normalizedCell    = cellText.NormalizeTextResult();
                 var normalizedKeyWord = KeyWord.NormalizeTextResult();
                 if (!normalizedCell.Equals(normalizedKeyWord))
                 {
                     continue;
                 }

                 // Count the hit for this title.
                 if (TrainingTitleResult.ContainsKey(header))
                 {
                     TrainingTitleResult[header]++;
                 }
                 else
                 {
                     TrainingTitleResult.Add(header, 1);
                 }

                 // Keep each distinct joined row only once.
                 var joinedRow = String.Join(",", table.GetRow());
                 if (!WholeHeaderRow.Contains(joinedRow))
                 {
                     WholeHeaderRow.Add(joinedRow);
                 }
             }
         }
     }
 }
        /// <summary>
        /// Queries allmusic.com for an album and fills <c>_albumList</c> with the results,
        /// sorted with <c>AlbumSort</c>.
        /// </summary>
        /// <param name="strAlbum">Album title to search for; URL-encoded into the request.</param>
        /// <param name="artistName">Artist name handed to <c>AlbumSort</c> for ordering the results.</param>
        /// <param name="releaseYear">Release year handed to <c>AlbumSort</c> for ordering the results.</param>
        /// <returns>
        /// <c>true</c> when the response parsed as a single album page or contained the
        /// search-result table; <c>false</c> when the response was empty or no result table
        /// could be located.
        /// </returns>
        public bool FindAlbuminfo(string strAlbum, string artistName, int releaseYear)
        {
            const string hrefStart = "<a href=\"";

            _albumList.Clear();

            // Request has the form
            // http://www.allmusic.com/cg/amg.dll?P=amg&SQL=escapolygy&OPT1=2
            HTMLUtil util     = new HTMLUtil();
            string   postData = String.Format("P=amg&SQL={0}&OPT1=2", HttpUtility.UrlEncode(strAlbum));
            string   html     = PostHTTP("http://www.allmusic.com/cg/amg.dll", postData);

            // A null response is treated like an empty one instead of throwing.
            if (String.IsNullOrEmpty(html))
            {
                return(false);
            }

            // The response may already be the page of a single album.
            MusicAlbumInfo newAlbum = new MusicAlbumInfo();

            newAlbum.AlbumURL = "http://www.allmusic.com/cg/amg.dll?" + postData;
            if (newAlbum.Parse(html))
            {
                _albumList.Add(newAlbum);
                return(true);
            }

            // Otherwise locate the result table. Markup is matched case-insensitively with
            // ordinal rules so the outcome does not depend on the current culture
            // (lower-casing "ID" under a Turkish culture, for example, does not yield "id").
            int startOfTable = html.IndexOf("id=\"expansiontable1\"", StringComparison.OrdinalIgnoreCase);

            if (startOfTable < 0)
            {
                return(false);
            }
            // Walk back to the opening tag of the table that carries that id.
            startOfTable = html.LastIndexOf("<table", startOfTable, StringComparison.OrdinalIgnoreCase);
            if (startOfTable < 0)
            {
                return(false);
            }

            HTMLTable table    = new HTMLTable();
            string    strTable = html.Substring(startOfTable);

            table.Parse(strTable);

            // Row 0 is skipped; every following row describes one album.
            for (int i = 1; i < table.Rows; ++i)
            {
                HTMLTable.HTMLRow row          = table.GetRow(i);
                string            albumName    = "";
                string            nameOfAlbum  = "";
                string            nameOfArtist = "";
                for (int iCol = 0; iCol < row.Columns; ++iCol)
                {
                    string column = row.GetColumValue(iCol);
                    if (iCol == 1 && (column.Length != 0))
                    {
                        albumName = "(" + column + ")";
                    }
                    if (iCol == 2)
                    {
                        nameOfArtist = column;
                        util.RemoveTags(ref nameOfArtist);
                        if (!column.Equals("&nbsp;"))
                        {
                            albumName = String.Format("- {0} {1}", nameOfArtist, albumName);
                        }
                    }
                    if (iCol == 4)
                    {
                        string tempAlbum = column;
                        util.RemoveTags(ref tempAlbum);
                        albumName   = String.Format("{0} {1}", tempAlbum, albumName);
                        nameOfAlbum = tempAlbum;

                        // The same column carries the link to the album page.
                        int pos1 = column.IndexOf(hrefStart, StringComparison.Ordinal);
                        if (pos1 >= 0)
                        {
                            pos1 += hrefStart.Length;
                            int iPos2 = column.IndexOf("\">", pos1, StringComparison.Ordinal);
                            if (iPos2 >= 0)
                            {
                                if (nameOfAlbum.Length == 0)
                                {
                                    nameOfAlbum = albumName;
                                }

                                // Full album url looks like
                                // http://www.allmusic.com/cg/amg.dll?p=amg&token=&sql=10:66jieal64xs7
                                string url      = column.Substring(pos1, iPos2 - pos1);
                                string albumUrl = String.Format("http://www.allmusic.com{0}", url);
                                string albumNameStripped;
                                MusicAlbumInfo newAlbumInfo = new MusicAlbumInfo();
                                util.ConvertHTMLToAnsi(albumName, out albumNameStripped);
                                newAlbumInfo.Title2   = albumNameStripped;
                                newAlbumInfo.AlbumURL = util.ConvertHTMLToAnsi(albumUrl);
                                newAlbumInfo.Artist   = util.ConvertHTMLToAnsi(nameOfArtist);
                                newAlbumInfo.Title    = util.ConvertHTMLToAnsi(nameOfAlbum);
                                _albumList.Add(newAlbumInfo);
                            }
                        }
                    }
                }
            }

            // Order the candidates by how well they fit the requested album, artist and year.
            _albumList.Sort(new AlbumSort(strAlbum, artistName, releaseYear));
            return(true);
        }
Exemple #4
0
    /// <summary>
    /// Locates, in each table of the document, the header row whose cells satisfy the title
    /// rules and extracts the matching columns of the data rows that follow it.
    /// </summary>
    /// <param name="root">Parsed document; its <c>TableList</c> is read with keys 1..Count.</param>
    /// <param name="Rules">
    /// One rule per wanted column. Each rule lists its candidate titles in priority order;
    /// the first rule describes the primary field, whose cell must be non-empty for a data
    /// row to be extracted.
    /// </param>
    /// <param name="IsMeger">When true the collected rows are passed through <c>MergerMultiInfo</c> before being returned.</param>
    /// <returns>
    /// One array per extracted data row, indexed like <paramref name="Rules"/>; entries of rules
    /// whose column was not found keep their default value. A row is not added when
    /// <c>IsSameContent</c> reports it equal to a row collected earlier.
    /// </returns>
    public static List <CellInfo[]> GetMultiInfoByTitleRules(HTMLEngine.MyRootHtmlNode root, List <TableSearchTitleRule> Rules, bool IsMeger)
    {
        var Container = new List <CellInfo[]>();

        // A document without tables yields nothing.
        if (root.TableList == null)
        {
            return Container;
        }

        for (int tableIndex = 0; tableIndex < root.TableList.Count; tableIndex++)
        {
            var      table = new HTMLTable(root.TableList[tableIndex + 1]);
            var      checkResultColumnNo = new int[Rules.Count];    // 1-based column per rule, 0 = not found
            var      checkResultTitle    = new string[Rules.Count]; // header text found per rule
            var      HeaderRowNo         = -1;
            String[] HeaderRow           = null;
            var      IsFirstRowOneCell   = false; // whether row 1 is a single cell merged across the whole row

            for (int TestRowHeader = 1; TestRowHeader < table.RowCount; TestRowHeader++)
            {
                // Every candidate header row starts from a clean slate. The titles have to be
                // reset together with the column numbers: a title left over from a rejected
                // row would otherwise end the candidate-title search below after its first
                // title, so lower-priority titles would never be tried on this row.
                checkResultColumnNo = new int[Rules.Count];
                checkResultTitle    = new string[Rules.Count];

                if (IsSingleMergedCellRow(table, TestRowHeader))
                {
                    if (TestRowHeader == 1)
                    {
                        IsFirstRowOneCell = true;
                    }
                    continue;
                }

                HeaderRow = table.GetRow(TestRowHeader);
                for (int checkItemIdx = 0; checkItemIdx < Rules.Count; checkItemIdx++)
                {
                    var rule            = Rules[checkItemIdx];
                    var needsSuperTitle = rule.SuperTitle != null && rule.SuperTitle.Count != 0;

                    // Titles are tried one by one in priority order.
                    foreach (var EvaluateTitle in rule.Title)
                    {
                        for (int ColIndex = 0; ColIndex < HeaderRow.Length; ColIndex++)
                        {
                            if (!HeaderCellMatchesTitle(rule, EvaluateTitle, HeaderRow[ColIndex]))
                            {
                                continue;
                            }
                            // When the rule names parent titles, one of them must appear
                            // above this cell in the same column.
                            if (needsSuperTitle && !HasMatchingSuperTitle(table, rule, TestRowHeader, ColIndex + 1))
                            {
                                continue;
                            }
                            checkResultTitle[checkItemIdx]    = HeaderRow[ColIndex];
                            checkResultColumnNo[checkItemIdx] = ColIndex + 1;
                            break;
                        }
                        if (!String.IsNullOrEmpty(checkResultTitle[checkItemIdx]))
                        {
                            break;
                        }
                    }

                    // Without the primary field there is no point in looking for the others.
                    if (checkResultColumnNo[0] == 0)
                    {
                        break;
                    }
                }

                var allRequiredFound = true;
                var allFound         = true;
                for (int i = 0; i < Rules.Count; i++)
                {
                    if (checkResultColumnNo[i] != 0)
                    {
                        continue;
                    }
                    allFound = false;
                    if (Rules[i].IsRequire)
                    {
                        allRequiredFound = false;
                        break;
                    }
                }

                // The first row, or any row when row 1 is one merged cell, only needs the
                // required columns. A header further down is checked strictly: every rule
                // must have found its column.
                var relaxedCheck = TestRowHeader == 1 || IsFirstRowOneCell;
                if (allRequiredFound && (relaxedCheck || allFound))
                {
                    HeaderRowNo = TestRowHeader;
                    break;
                }
            }

            // No header row in this table; go on with the next table.
            if (HeaderRowNo == -1)
            {
                continue;
            }

            for (int RowNo = HeaderRowNo + 1; RowNo <= table.RowCount; RowNo++)
            {
                // Total rows are not data.
                if (table.IsTotalRow(RowNo))
                {
                    continue;
                }
                // The primary field must carry a real value.
                var target = table.CellValue(RowNo, checkResultColumnNo[0]);
                if (target == String.Empty || target == strRowSpanValue || target == strColSpanValue || target == strNullValue)
                {
                    continue;
                }
                // Skip rows whose primary cell is itself one of the primary titles.
                if (Rules[0].Title.Contains(target))
                {
                    continue;
                }

                var RowData = new CellInfo[Rules.Count];
                for (int checkItemIdx = 0; checkItemIdx < Rules.Count; checkItemIdx++)
                {
                    if (checkResultColumnNo[checkItemIdx] == 0)
                    {
                        continue;
                    }
                    var ColNo = checkResultColumnNo[checkItemIdx];
                    RowData[checkItemIdx].TableId = tableIndex + 1;
                    RowData[checkItemIdx].Row     = RowNo;
                    RowData[checkItemIdx].Column  = ColNo;
                    RowData[checkItemIdx].Title   = checkResultTitle[checkItemIdx];

                    var cellValue = table.CellValue(RowNo, ColNo);
                    if (cellValue.Equals(strNullValue))
                    {
                        // Position and title are kept, RawData stays unset.
                        continue;
                    }
                    RowData[checkItemIdx].RawData = cellValue;
                    if (Rules[checkItemIdx].Normalize != null)
                    {
                        RowData[checkItemIdx].RawData = Rules[checkItemIdx].Normalize(RowData[checkItemIdx].RawData, HeaderRow[ColNo - 1]);
                    }
                }

                var HasSame = false;
                foreach (var existRow in Container)
                {
                    if (IsSameContent(existRow, RowData))
                    {
                        HasSame = true;
                        break;
                    }
                }
                if (!HasSame)
                {
                    Container.Add(RowData);
                }
            }
        }
        if (IsMeger)
        {
            Container = MergerMultiInfo(Container);
        }
        return Container;
    }

    // True when every cell of the row holds the same value as its first cell,
    // i.e. the row is one cell merged across all columns.
    private static bool IsSingleMergedCellRow(HTMLTable table, int rowNo)
    {
        for (int i = 2; i <= table.ColumnCount; i++)
        {
            if (table.CellValue(rowNo, i) != table.CellValue(rowNo, 1))
            {
                return false;
            }
        }
        return true;
    }

    // True when the header cell (spaces removed) equals or contains the candidate title,
    // depending on the rule's mode, and the raw cell holds none of the rule's excluded words.
    private static bool HeaderCellMatchesTitle(TableSearchTitleRule rule, string evaluateTitle, string headerCell)
    {
        var compactCell = headerCell.Replace(" ", "");
        var isMatch     = rule.IsTitleEq
            ? evaluateTitle.Equals(compactCell)
            : compactCell.Contains(evaluateTitle);
        if (!isMatch)
        {
            return false;
        }
        if (rule.ExcludeTitle != null)
        {
            foreach (var word in rule.ExcludeTitle)
            {
                if (headerCell.Contains(word))
                {
                    return false;
                }
            }
        }
        return true;
    }

    // True when a row above the header row holds, in the given column, a value (spaces
    // removed) that equals or contains one of the rule's parent titles, depending on the mode.
    private static bool HasMatchingSuperTitle(HTMLTable table, TableSearchTitleRule rule, int headerRowNo, int columnNo)
    {
        for (int superRowNo = 1; superRowNo < headerRowNo; superRowNo++)
        {
            var value = table.CellValue(superRowNo, columnNo).Replace(" ", "");
            if (rule.IsSuperTitleEq)
            {
                if (rule.SuperTitle.Contains(value))
                {
                    return true;
                }
            }
            else
            {
                foreach (var supertitle in rule.SuperTitle)
                {
                    if (value.Contains(supertitle))
                    {
                        return true;
                    }
                }
            }
        }
        return false;
    }