/// <summary>
/// Collects every table row that has at least one cell matching the content rule.
/// </summary>
/// <remarks>
/// Spaces are stripped from each cell before comparison. When <c>rule.IsContentEq</c> is set
/// a cell matches if it equals one of the keywords in <c>rule.Content</c>; otherwise a cell
/// matches if it contains one of the keywords. A row is added at most once, regardless of how
/// many of its cells match.
/// </remarks>
/// <param name="root">Document whose <c>TableList</c> is scanned; tables are looked up by the numbers 1..Count.</param>
/// <param name="rule">Keywords (<c>Content</c>) and matching mode (<c>IsContentEq</c>).</param>
/// <returns>
/// The result of <c>ConvertRowToCellInfo</c> for each matching row, in table/row order.
/// Empty when <c>root.TableList</c> or <c>rule.Content</c> is null, or when nothing matches.
/// </returns>
public static List<List<CellInfo>> GetMultiRowsByContentRule(HTMLEngine.MyRootHtmlNode root, TableSearchContentRule rule)
{
    var Container = new List<List<CellInfo>>();

    // Nothing to scan, or nothing to look for.
    if (root.TableList == null || rule.Content == null)
    {
        return Container;
    }

    for (int tableNo = 1; tableNo <= root.TableList.Count; tableNo++)
    {
        var table = new HTMLTable(root.TableList[tableNo]);
        var RowHeader = table.GetRow(1);

        // Rows are addressed 1..RowCount (GetMultiInfoByTitleRules iterates the same way),
        // so the upper bound is inclusive; a strict '<' would silently drop the last row.
        for (int RowNo = 1; RowNo <= table.RowCount; RowNo++)
        {
            var row = table.GetRow(RowNo);
            var IsMatch = false;

            foreach (var cell in row)
            {
                var cellText = cell.Replace(" ", "");
                foreach (var content in rule.Content)
                {
                    if (rule.IsContentEq)
                    {
                        // Equality mode: the cell must be exactly the keyword.
                        IsMatch = content.Equals(cellText);
                    }
                    else
                    {
                        // Contains mode: the cell must contain the keyword. Testing the
                        // other way round would let every empty cell match any keyword.
                        IsMatch = cellText.Contains(content);
                    }

                    if (IsMatch)
                    {
                        break;
                    }
                }

                if (IsMatch)
                {
                    break;
                }
            }

            if (IsMatch)
            {
                Container.Add(ConvertRowToCellInfo(row, tableNo, RowNo, RowHeader));
            }
        }
    }

    return Container;
}
/// <summary>
/// Finds the header titles of the columns that contain the keyword and tallies them.
/// </summary>
/// <remarks>
/// For every cell from the second row onward whose (optionally transformed) normalized text
/// equals the normalized keyword, the counter stored under the column's first-row title in
/// <c>TrainingTitleResult</c> is incremented, and the comma-joined result of the table's
/// parameterless <c>GetRow()</c> call (presumably the header row - confirm against HTMLTable)
/// is recorded once in <c>WholeHeaderRow</c>. Columns whose first-row title is empty after
/// removing spaces are ignored.
/// </remarks>
/// <param name="root">Document whose <c>TableList</c> is scanned; a null list is a no-op.</param>
/// <param name="KeyWord">Text to look for; compared after <c>NormalizeTextResult()</c>.</param>
public void PutTitleTrainingItem(HTMLEngine.MyRootHtmlNode root, string KeyWord)
{
    if (root.TableList == null)
    {
        return;
    }

    // Loop-invariant: normalize the keyword once instead of once per cell.
    var normalizedKeyWord = KeyWord.NormalizeTextResult();

    foreach (var Table in root.TableList)
    {
        var t = new HTMLTable(Table.Value);

        // Joined header text for this table, built lazily on the first hit.
        string wholeHeader = null;

        // Row 1 holds the titles, so data starts at row 2. Cells are addressed 1..RowCount and
        // 1..ColumnCount (GetMultiInfoByTitleRules iterates the same way), so both bounds are
        // inclusive; a strict '<' would skip the last row and the last column.
        for (int RowNo = 2; RowNo <= t.RowCount; RowNo++)
        {
            for (int ColNo = 1; ColNo <= t.ColumnCount; ColNo++)
            {
                var title = t.CellValue(1, ColNo).Replace(" ", "");
                if (String.IsNullOrEmpty(title))
                {
                    continue;
                }

                var value = t.CellValue(RowNo, ColNo);
                if (Transform != null)
                {
                    value = Transform(value, title);
                }

                if (!value.NormalizeTextResult().Equals(normalizedKeyWord))
                {
                    continue;
                }

                // Single lookup: 'hits' stays 0 when the title has not been seen yet.
                int hits;
                TrainingTitleResult.TryGetValue(title, out hits);
                TrainingTitleResult[title] = hits + 1;

                if (wholeHeader == null)
                {
                    wholeHeader = String.Join(",", t.GetRow());
                }

                if (!WholeHeaderRow.Contains(wholeHeader))
                {
                    WholeHeaderRow.Add(wholeHeader);
                }
            }
        }
    }
}
/// <summary>
/// Looks up an album on allmusic.com and fills the internal album list with the results.
/// </summary>
/// <remarks>
/// The response is first offered to <c>MusicAlbumInfo.Parse</c>; if that succeeds the page is
/// a single album and it becomes the only entry. Otherwise the table enclosing the element
/// with id "expansiontable1" is parsed as a result list: every row after the first yields one
/// entry if its fifth column contains an album link. The list is finally sorted with
/// <c>AlbumSort</c> using the requested album, artist and year.
/// </remarks>
/// <param name="strAlbum">Album title to search for; URL-encoded into the request.</param>
/// <param name="artistName">Artist name, passed to the sorter only.</param>
/// <param name="releaseYear">Release year, passed to the sorter only.</param>
/// <returns>
/// <c>false</c> when the response is empty or contains neither an album page nor the result
/// table; <c>true</c> otherwise (even if the result table produced no entries).
/// </returns>
public bool FindAlbuminfo(string strAlbum, string artistName, int releaseYear)
{
    const string AnchorPrefix = "<a href=\"";

    _albumList.Clear();

    // Request shape:
    // http://www.allmusic.com/cg/amg.dll?P=amg&SQL=escapolygy&OPT1=2
    HTMLUtil util = new HTMLUtil();
    string postData = String.Format("P=amg&SQL={0}&OPT1=2", HttpUtility.UrlEncode(strAlbum));
    string html = PostHTTP("http://www.allmusic.com/cg/amg.dll", postData);
    if (String.IsNullOrEmpty(html))
    {
        return false;
    }

    // Check whether the response already is an album page.
    MusicAlbumInfo newAlbum = new MusicAlbumInfo();
    newAlbum.AlbumURL = "http://www.allmusic.com/cg/amg.dll?" + postData;
    if (newAlbum.Parse(html))
    {
        _albumList.Add(newAlbum);
        return true;
    }

    // Locate the result table. The markers are plain ASCII markup, so search the original
    // text with an ordinal case-insensitive comparison instead of lower-casing a copy of the
    // whole page (culture-sensitive lowering can also break the match, e.g. Turkish 'I').
    int startOfTable = html.IndexOf("id=\"expansiontable1\"", StringComparison.OrdinalIgnoreCase);
    if (startOfTable < 0)
    {
        return false;
    }

    // Walk back from the id attribute to the opening tag of the table that carries it.
    startOfTable = html.LastIndexOf("<table", startOfTable, StringComparison.OrdinalIgnoreCase);
    if (startOfTable < 0)
    {
        return false;
    }

    HTMLTable table = new HTMLTable();
    string strTable = html.Substring(startOfTable);
    table.Parse(strTable);

    // Row 0 is skipped (presumably the header row - confirm against the page layout).
    for (int i = 1; i < table.Rows; ++i)
    {
        HTMLTable.HTMLRow row = table.GetRow(i);
        string albumName = "";
        string nameOfAlbum = "";
        string nameOfArtist = "";

        for (int iCol = 0; iCol < row.Columns; ++iCol)
        {
            string column = row.GetColumValue(iCol);

            if (iCol == 1 && column.Length != 0)
            {
                // Column 1 is appended to the display name in parentheses.
                albumName = "(" + column + ")";
            }

            if (iCol == 2)
            {
                // Column 2: artist, with markup stripped.
                nameOfArtist = column;
                util.RemoveTags(ref nameOfArtist);
                if (!column.Equals(" "))
                {
                    albumName = String.Format("- {0} {1}", nameOfArtist, albumName);
                }
            }

            if (iCol == 4)
            {
                // Column 4: album title, optionally wrapped in a link to the album page.
                string tempAlbum = column;
                util.RemoveTags(ref tempAlbum);
                albumName = String.Format("{0} {1}", tempAlbum, albumName);
                nameOfAlbum = tempAlbum;

                int pos1 = column.IndexOf(AnchorPrefix, StringComparison.Ordinal);
                if (pos1 >= 0)
                {
                    pos1 += AnchorPrefix.Length;
                    int iPos2 = column.IndexOf("\">", pos1, StringComparison.Ordinal);
                    if (iPos2 >= 0)
                    {
                        if (nameOfAlbum.Length == 0)
                        {
                            nameOfAlbum = albumName;
                        }

                        // Full album url looks like:
                        // http://www.allmusic.com/cg/amg.dll?p=amg&token=&sql=10:66jieal64xs7
                        string url = column.Substring(pos1, iPos2 - pos1);
                        string albumUrl = String.Format("http://www.allmusic.com{0}", url);
                        string albumNameStripped;

                        MusicAlbumInfo newAlbumInfo = new MusicAlbumInfo();
                        util.ConvertHTMLToAnsi(albumName, out albumNameStripped);
                        newAlbumInfo.Title2 = albumNameStripped;
                        newAlbumInfo.AlbumURL = util.ConvertHTMLToAnsi(albumUrl);
                        newAlbumInfo.Artist = util.ConvertHTMLToAnsi(nameOfArtist);
                        newAlbumInfo.Title = util.ConvertHTMLToAnsi(nameOfAlbum);
                        _albumList.Add(newAlbumInfo);
                    }
                }
            }
        }
    }

    // Order the candidates relative to the requested album/artist/year.
    _albumList.Sort(new AlbumSort(strAlbum, artistName, releaseYear));
    return true;
}
/// <summary>
/// Extracts one <c>CellInfo[]</c> per data row from every table whose header row satisfies
/// the title rules, trying each rule's titles in priority order.
/// </summary>
/// <remarks>
/// <para>
/// Header detection: rows 1..RowCount-1 are tried in order as header candidates. A row whose
/// cells all equal its first cell (a fully merged row) is skipped. For each rule the titles in
/// <c>Title</c> are tried one after another against every header cell; a cell is accepted when
/// it matches the title (equality or containment per <c>IsTitleEq</c>, spaces ignored), holds
/// none of the <c>ExcludeTitle</c> words and, if <c>SuperTitle</c> is non-empty, has a matching
/// super title somewhere above it in the same column. <c>Rules[0]</c> is the main field: if it
/// is not found the remaining rules are not evaluated for that row. A candidate is accepted
/// when every required rule was found; candidates other than row 1 must additionally match
/// ALL rules unless the first row was a fully merged row.
/// </para>
/// <para>
/// Data extraction: every row after the header is read, except total rows, rows whose main
/// field is empty / a span placeholder / the null placeholder, and rows whose main field
/// repeats one of the main rule's titles. Rows equal to an already collected row
/// (<c>IsSameContent</c>) are dropped.
/// </para>
/// </remarks>
/// <param name="root">Document whose <c>TableList</c> is scanned; tables are looked up by the numbers 1..Count.</param>
/// <param name="Rules">Title rules; index 0 is the main field. Result arrays are indexed like this list.</param>
/// <param name="IsMeger">When true the collected rows are passed through <c>MergerMultiInfo</c> before returning.</param>
/// <returns>
/// The collected rows; empty when <c>root.TableList</c> is null, <paramref name="Rules"/> is
/// null or empty, or no table qualifies.
/// </returns>
public static List<CellInfo[]> GetMultiInfoByTitleRules(HTMLEngine.MyRootHtmlNode root, List<TableSearchTitleRule> Rules, bool IsMeger)
{
    var Container = new List<CellInfo[]>();

    // Without tables or rules there is nothing to extract (and Rules[0] below would throw).
    if (root.TableList == null || Rules == null || Rules.Count == 0)
    {
        return Container;
    }

    for (int tableIndex = 0; tableIndex < root.TableList.Count; tableIndex++)
    {
        var table = new HTMLTable(root.TableList[tableIndex + 1]);
        var checkResultColumnNo = new int[Rules.Count];   // 1-based column per rule, 0 = not found
        var checkResultTitle = new string[Rules.Count];   // raw header text per rule
        var HeaderRowNo = -1;
        String[] HeaderRow = null;
        var IsFirstRowOneCell = false;                    // is the first row one fully merged cell?

        for (int TestRowHeader = 1; TestRowHeader < table.RowCount; TestRowHeader++)
        {
            // Start every candidate row from a clean slate. The titles must be reset together
            // with the column numbers: a title left over from a rejected candidate would trip
            // the "already found" check below and stop lower-priority titles from being tried.
            checkResultColumnNo = new int[Rules.Count];
            checkResultTitle = new string[Rules.Count];

            // A row whose cells all repeat the first cell is a fully merged row, not a header.
            var IsOneColumnRow = true;
            for (int i = 2; i <= table.ColumnCount; i++)
            {
                if (table.CellValue(TestRowHeader, i) != table.CellValue(TestRowHeader, 1))
                {
                    IsOneColumnRow = false;
                    break;
                }
            }
            if (IsOneColumnRow)
            {
                if (TestRowHeader == 1)
                {
                    IsFirstRowOneCell = true;
                }
                continue;
            }

            HeaderRow = table.GetRow(TestRowHeader);
            for (int checkItemIdx = 0; checkItemIdx < Rules.Count; checkItemIdx++)
            {
                var rule = Rules[checkItemIdx];
                bool needSuperTitle = rule.SuperTitle != null && rule.SuperTitle.Count != 0;

                // Titles are searched one by one, in priority order.
                foreach (var EvaluateTitle in rule.Title)
                {
                    // Scan every cell of the candidate header row.
                    for (int ColIndex = 0; ColIndex < HeaderRow.Length; ColIndex++)
                    {
                        if (!IsHeaderCellMatchingTitleRule(rule, EvaluateTitle, HeaderRow[ColIndex]))
                        {
                            continue;
                        }
                        if (needSuperTitle && !HasSuperTitleAbove(table, rule, TestRowHeader, ColIndex + 1))
                        {
                            continue;
                        }
                        checkResultTitle[checkItemIdx] = HeaderRow[ColIndex];
                        checkResultColumnNo[checkItemIdx] = ColIndex + 1;
                        break;
                    }
                    if (!String.IsNullOrEmpty(checkResultTitle[checkItemIdx]))
                    {
                        // Found with this title; lower-priority titles are not needed.
                        break;
                    }
                }

                // Main field not found: no point in looking for the others in this row.
                if (checkResultColumnNo[0] == 0)
                {
                    break;
                }
            }

            bool IsAllRequiredItemOK = true;
            for (int checkItemIdx = 0; checkItemIdx < checkResultColumnNo.Length; checkItemIdx++)
            {
                if (checkResultColumnNo[checkItemIdx] == 0 && Rules[checkItemIdx].IsRequire)
                {
                    IsAllRequiredItemOK = false;
                    break;
                }
            }
            if (!IsAllRequiredItemOK)
            {
                continue;
            }

            if (TestRowHeader == 1 || IsFirstRowOneCell)
            {
                HeaderRowNo = TestRowHeader;
                break;
            }

            // The header is not the first row and the first row is not one big merged cell:
            // be strict and (for now) require every rule to have matched.
            var IsOK = true;
            for (int i = 0; i < Rules.Count; i++)
            {
                if (checkResultColumnNo[i] == 0)
                {
                    IsOK = false;
                    break;
                }
            }
            if (IsOK)
            {
                HeaderRowNo = TestRowHeader;
                break;
            }
        }

        // No header row found: next table.
        if (HeaderRowNo == -1)
        {
            continue;
        }

        // Data rows are the rows below the header, up to and including the last row.
        for (int RowNo = HeaderRowNo + 1; RowNo <= table.RowCount; RowNo++)
        {
            if (table.IsTotalRow(RowNo))
            {
                continue;   // skip total rows
            }

            // The main field must carry a real value.
            var target = table.CellValue(RowNo, checkResultColumnNo[0]);
            if (target == String.Empty || target == strRowSpanValue || target == strColSpanValue || target == strNullValue)
            {
                continue;
            }
            // A main field that repeats one of the main rule's titles is a repeated header.
            if (Rules[0].Title.Contains(target))
            {
                continue;
            }

            var RowData = new CellInfo[Rules.Count];
            for (int checkItemIdx = 0; checkItemIdx < Rules.Count; checkItemIdx++)
            {
                if (checkResultColumnNo[checkItemIdx] == 0)
                {
                    continue;   // rule had no column in this table
                }
                var ColNo = checkResultColumnNo[checkItemIdx];
                RowData[checkItemIdx].TableId = tableIndex + 1;
                RowData[checkItemIdx].Row = RowNo;
                RowData[checkItemIdx].Column = ColNo;
                RowData[checkItemIdx].Title = checkResultTitle[checkItemIdx];

                var cellValue = table.CellValue(RowNo, ColNo);
                if (cellValue.Equals(strNullValue))
                {
                    continue;   // position is recorded, RawData stays unset
                }
                RowData[checkItemIdx].RawData = cellValue;
                if (Rules[checkItemIdx].Normalize != null)
                {
                    RowData[checkItemIdx].RawData = Rules[checkItemIdx].Normalize(RowData[checkItemIdx].RawData, HeaderRow[ColNo - 1]);
                }
            }

            // Drop rows that duplicate an already collected row.
            var HasSame = false;
            foreach (var existRow in Container)
            {
                if (IsSameContent(existRow, RowData))
                {
                    HasSame = true;
                    break;
                }
            }
            if (!HasSame)
            {
                Container.Add(RowData);
            }
        }
    }

    if (IsMeger)
    {
        Container = MergerMultiInfo(Container);
    }
    return Container;
}

/// <summary>
/// Tells whether a header cell matches one title of a rule and contains none of the rule's excluded words.
/// </summary>
private static bool IsHeaderCellMatchingTitleRule(TableSearchTitleRule rule, string evaluateTitle, string headerCell)
{
    var compactCell = headerCell.Replace(" ", "");
    bool isTitleHit = rule.IsTitleEq
        ? evaluateTitle.Equals(compactCell)      // equality mode
        : compactCell.Contains(evaluateTitle);   // contains mode
    if (!isTitleHit)
    {
        return false;
    }

    if (rule.ExcludeTitle != null)
    {
        // Excluded words are tested against the raw cell text (spaces kept).
        foreach (var word in rule.ExcludeTitle)
        {
            if (headerCell.Contains(word))
            {
                return false;
            }
        }
    }
    return true;
}

/// <summary>
/// Tells whether any row above the header row holds, in the given column, a cell matching one of the rule's super titles.
/// </summary>
private static bool HasSuperTitleAbove(HTMLTable table, TableSearchTitleRule rule, int headerRowNo, int columnNo)
{
    for (int superRowNo = 1; superRowNo < headerRowNo; superRowNo++)
    {
        var value = table.CellValue(superRowNo, columnNo).Replace(" ", "");
        if (rule.IsSuperTitleEq)
        {
            // Equality mode: the cell must be one of the super titles.
            if (rule.SuperTitle.Contains(value))
            {
                return true;
            }
        }
        else
        {
            // Contains mode: the cell must contain one of the super titles.
            foreach (var supertitle in rule.SuperTitle)
            {
                if (value.Contains(supertitle))
                {
                    return true;
                }
            }
        }
    }
    return false;
}