/// <summary>
/// Builds the output fields for a single MediaWiki page: its title, an
/// English Wikipedia URL built from the title, the raw revision text, and a
/// shortened, markup-reduced version of the introduction.
/// </summary>
/// <param name="mediaWiki">The page to convert; may be null.</param>
/// <returns>
/// A four-element array { title, URL, raw revision text, shortened introduction },
/// or null when <paramref name="mediaWiki"/> is null.
/// </returns>
public String[] Convert(mediawiki mediaWiki)
{
    if (mediaWiki == null)
    {
        return null;
    }

    // Upper bound on the introduction length, before the trailing dots are appended.
    const int maxIntroLength = 500;

    String rawText = mediaWiki.page.revision.text.Value;

    List<String> arParts = new List<String>();
    arParts.Add(mediaWiki.page.title);
    arParts.Add(String.Format("http://en.wikipedia.org/wiki/{0}", mediaWiki.page.title));
    arParts.Add(rawText);

    String text = rawText;

    // Rewrite every "<...>" span on a single line (HTML comments and tags alike).
    // NOTE(review): MatchEvaluatorSimple is defined elsewhere; presumably it
    // returns an empty string so the match is dropped - confirm.
    Regex regex = new Regex(@"<{1}.*?>{1}");
    text = regex.Replace(text, this.MatchEvaluatorSimple);

    // Process only the introduction (everything before the first "==" heading
    // marker) to save time and space.
    int firstHeaderIndex = text.IndexOf("==", StringComparison.Ordinal);
    if (firstHeaderIndex != -1)
    {
        text = text.Substring(0, firstHeaderIndex);
    }

    // Remove "{{...}}" templates. The pattern only matches templates with no
    // "{" inside, so nested templates are peeled innermost-first by looping.
    regex = new Regex(@"{{([^{{])+?}}");
    while (regex.IsMatch(text))
    {
        text = regex.Replace(text, this.MatchEvaluatorSimple);
    }

    // Remove parentheticals that contain no letters or digits, like "(, )".
    // The character class uses real ranges; the previous "a..z" form only
    // excluded the literal characters a . z A Z 0 9.
    regex = new Regex(@"\(([^a-zA-Z0-9])+?\)");
    while (regex.IsMatch(text))
    {
        text = regex.Replace(text, this.MatchEvaluatorSimple);
    }

    // Simplify "[[target|label]]" style links; the replacement text is
    // produced by MatchEvaluator, which is defined elsewhere.
    regex = new Regex(@"\[\[([^\|\]]+)(\|{0,1})([^\|\]]+)\]\](<nowiki>(.+)?</nowiki>){0,1}");
    text = regex.Replace(text, this.MatchEvaluator);

    // Strip bold (''') before italic ('') quote markup so no stray quote is left.
    text = text.Replace("'''", "");
    text = text.Replace("''", "");

    // Truncate only when the text is too long. Short text keeps its last
    // character, and empty text no longer produces a negative length.
    if (text.Length > maxIntroLength)
    {
        text = text.Substring(0, maxIntroLength);
    }
    text += ".....";

    arParts.Add(text);
    return arParts.ToArray();
}
/// <summary>
/// Builds the output fields for a single MediaWiki page: its title, an
/// English Wikipedia URL built from the title, the raw revision text, and a
/// shortened, markup-reduced version of the introduction.
/// </summary>
/// <param name="mediaWiki">The page to convert; may be null.</param>
/// <returns>
/// A four-element array { title, URL, raw revision text, shortened introduction },
/// or null when <paramref name="mediaWiki"/> is null.
/// </returns>
public String[] Convert(mediawiki mediaWiki)
{
    if (mediaWiki == null)
    {
        return null;
    }

    // Upper bound on the introduction length, before the trailing dots are appended.
    const int maxIntroLength = 500;

    String rawText = mediaWiki.page.revision.text.Value;

    List<String> arParts = new List<String>();
    arParts.Add(mediaWiki.page.title);
    arParts.Add(String.Format("http://en.wikipedia.org/wiki/{0}", mediaWiki.page.title));
    arParts.Add(rawText);

    String text = rawText;

    // Rewrite every "<...>" span on a single line (HTML comments and tags alike).
    // NOTE(review): MatchEvaluatorSimple is defined elsewhere; presumably it
    // returns an empty string so the match is dropped - confirm.
    Regex regex = new Regex(@"<{1}.*?>{1}");
    text = regex.Replace(text, this.MatchEvaluatorSimple);

    // Process only the introduction (everything before the first "==" heading
    // marker) to save time and space.
    int firstHeaderIndex = text.IndexOf("==", StringComparison.Ordinal);
    if (firstHeaderIndex != -1)
    {
        text = text.Substring(0, firstHeaderIndex);
    }

    // Remove "{{...}}" templates. The pattern only matches templates with no
    // "{" inside, so nested templates are peeled innermost-first by looping.
    regex = new Regex(@"{{([^{{])+?}}");
    while (regex.IsMatch(text))
    {
        text = regex.Replace(text, this.MatchEvaluatorSimple);
    }

    // Remove parentheticals that contain no letters or digits, like "(, )".
    // The character class uses real ranges; the previous "a..z" form only
    // excluded the literal characters a . z A Z 0 9.
    regex = new Regex(@"\(([^a-zA-Z0-9])+?\)");
    while (regex.IsMatch(text))
    {
        text = regex.Replace(text, this.MatchEvaluatorSimple);
    }

    // Simplify "[[target|label]]" style links; the replacement text is
    // produced by MatchEvaluator, which is defined elsewhere.
    regex = new Regex(@"\[\[([^\|\]]+)(\|{0,1})([^\|\]]+)\]\](<nowiki>(.+)?</nowiki>){0,1}");
    text = regex.Replace(text, this.MatchEvaluator);

    // Strip bold (''') before italic ('') quote markup so no stray quote is left.
    text = text.Replace("'''", "");
    text = text.Replace("''", "");

    // Truncate only when the text is too long. Short text keeps its last
    // character, and empty text no longer produces a negative length.
    if (text.Length > maxIntroLength)
    {
        text = text.Substring(0, maxIntroLength);
    }
    text += ".....";

    arParts.Add(text);
    return arParts.ToArray();
}