/// <summary>
/// Rebuilds <c>Terms</c> from the rows of <c>dgvTerms</c> and saves the result to
/// <c>xml\{ASIN}.entities.xml</c> under the current directory, where the ASIN is the
/// text of <c>txtAsin</c>.
/// </summary>
/// <remarks>
/// Column layout read from each row: 0 = type image, 1 = name, 2 = comma-separated aliases,
/// 3 = description, 4 = description URL, 5 = description source, 6 = match flag,
/// 7 = match-case flag, 9 = regex-aliases flag (column 8 is not read here).
/// Empty (null) cells are treated as an empty string or <c>false</c>.
/// </remarks>
private void CreateTerms()
{
    // Directory.CreateDirectory does nothing when the directory already exists,
    // so no separate existence check is needed.
    string xmlDirectory = Path.Combine(Environment.CurrentDirectory, "xml");
    Directory.CreateDirectory(xmlDirectory);
    string outfile = Path.Combine(xmlDirectory, txtAsin.Text + ".entities.xml");

    Terms.Clear();
    var termId = 1;
    foreach (DataGridViewRow row in dgvTerms.Rows)
    {
        // The blank placeholder row shown when the grid allows adding rows holds no data.
        if (row.IsNewRow)
        {
            continue;
        }

        string aliasText = row.Cells[2].Value?.ToString() ?? "";
        XRay.Term newTerm = new XRay.Term
        {
            Id = termId++,
            // The term type is derived from which icon is shown in the first column.
            Type = CompareImages((Bitmap)row.Cells[0].Value, Resources.character) ? "character" : "topic",
            TermName = row.Cells[1].Value?.ToString() ?? "",
            Aliases = aliasText != ""
                ? aliasText.Split(',').Distinct().ToList()
                : new List<string>(),
            Desc = row.Cells[3].Value?.ToString() ?? "",
            DescUrl = row.Cells[4].Value?.ToString() ?? "",
            DescSrc = row.Cells[5].Value?.ToString() ?? "",
            Match = (row.Cells[6].Value as bool?) ?? false,
            MatchCase = (row.Cells[7].Value as bool?) ?? false,
            RegexAliases = (row.Cells[9].Value as bool?) ?? false
        };
        Terms.Add(newTerm);
    }

    Functions.Save(Terms, outfile);
}
/// <summary>
/// Extract terms from the given db.
/// </summary>
/// <param name="xrayDb">Connection to any db containing the proper dataset.</param>
/// <param name="singleUse">If set, will close the connection when complete.</param>
/// <returns>
/// A lazily evaluated sequence with one term per row where <c>entity.has_info_card = '1'</c>.
/// Because this is an iterator, the connection is not opened and the query is not run
/// until enumeration starts. The command and reader are disposed (and the connection
/// closed when <paramref name="singleUse"/> is set) when enumeration finishes, is
/// abandoned early, or fails.
/// </returns>
public static IEnumerable<XRay.Term> ExtractTermsNew(DbConnection xrayDb, bool singleUse)
{
    if (xrayDb.State != ConnectionState.Open)
    {
        xrayDb.Open();
    }

    try
    {
        using (var command = xrayDb.CreateCommand())
        {
            command.CommandText = "SELECT entity.id,entity.label,entity.type,entity.count,entity_description.text,string.text as sourcetxt FROM entity"
                + " LEFT JOIN entity_description ON entity.id = entity_description.entity"
                + " LEFT JOIN source ON entity_description.source = source.id"
                + " LEFT JOIN string ON source.label = string.id AND string.language = 'en'"
                + " WHERE entity.has_info_card = '1'";

            using (var reader = command.ExecuteReader())
            {
                while (reader.Read())
                {
                    var newTerm = new XRay.Term
                    {
                        Id = Convert.ToInt32(reader["id"]),
                        TermName = (string)reader["label"],
                        Type = Convert.ToInt32(reader["type"]) == 1 ? "character" : "topic",
                        // Both columns come from LEFT JOINs, so they are DBNull when the
                        // entity has no description or no English source label.
                        Desc = reader["text"] as string ?? "",
                        DescSrc = reader["sourcetxt"] as string ?? ""
                    };

                    // Real locations aren't needed for extracting terms for preview or XML saving, but need count
                    for (var remaining = Convert.ToInt32(reader["count"]); remaining > 0; remaining--)
                    {
                        newTerm.Locs.Add(null);
                    }

                    // TODO: Should probably also confirm whether this URL exists or not
                    if (newTerm.DescSrc == "Wikipedia")
                    {
                        string articleTitle = newTerm.TermName.Replace(" ", "_");
                        newTerm.DescUrl = string.Format(@"http://en.wikipedia.org/wiki/{0}", articleTitle);
                    }

                    yield return newTerm;
                }
            }
        }
    }
    finally
    {
        // Runs when the enumerator is disposed, so an early break or an exception
        // still releases a single-use connection.
        if (singleUse)
        {
            xrayDb.Close();
        }
    }
}
/// <summary>
/// Replaces the contents of <c>Terms</c> with terms read from a UTF-8 text file made of
/// three-line records: a type line ("character" or "topic", case-insensitive), a name
/// line and a description line. Empty lines where a type line is expected are skipped.
/// </summary>
/// <typeparam name="T">Element type of the returned list.</typeparam>
/// <param name="txtfile">Path of the text file to read.</param>
/// <returns>
/// <c>null</c> if an invalid type line or a read error was reported to the user;
/// otherwise a list that this method never adds to. The parsed terms are stored in
/// <c>Terms</c>. NOTE(review): the returned list is always empty on success — confirm
/// callers only use it as a success indicator.
/// </returns>
private List<T> LoadTermsFromTxt<T>(string txtfile)
{
    List<T> itemList = new List<T>();
    using (StreamReader streamReader = new StreamReader(txtfile, Encoding.UTF8))
    {
        int termId = 1;
        // 1-based number of the line most recently read, used in error messages.
        int lineNumber = 0;
        Terms.Clear();
        while (!streamReader.EndOfStream)
        {
            try
            {
                // Invariant lowering so the comparison below does not depend on the UI culture.
                string termType = streamReader.ReadLine()?.ToLowerInvariant();
                lineNumber++;
                if (string.IsNullOrEmpty(termType))
                {
                    continue;
                }

                if (termType != "character" && termType != "topic")
                {
                    MessageBox.Show("Error: Invalid term type \"" + termType + "\" on line " + lineNumber);
                    return null;
                }

                // ReadLine returns null if the file ends in the middle of a record,
                // in which case TermName and/or Desc are left null.
                XRay.Term newTerm = new XRay.Term
                {
                    Type = termType,
                    TermName = streamReader.ReadLine(),
                    Desc = streamReader.ReadLine()
                };
                lineNumber += 2;
                newTerm.MatchCase = termType == "character";
                newTerm.DescSrc = "shelfari";
                newTerm.Id = termId++;
                Terms.Add(newTerm);
            }
            catch (Exception ex)
            {
                MessageBox.Show("An error occurred reading from txt file: " + ex.Message + "\r\n" + ex.StackTrace);
                return null;
            }
        }
    }

    return itemList;
}
// Are there actually any goodreads pages that aren't at goodreads.com for other languages??
/// <summary>
/// Downloads one Goodreads page and builds a term (constructed with "character") from it:
/// the name comes from the main content heading, aliases and description from the
/// page's "floatingBox" divs.
/// </summary>
/// <param name="baseUrl">Absolute URL on the target site; only its scheme and authority are used.</param>
/// <param name="relativeUrl">URL of the page to download, resolved against the authority of <paramref name="baseUrl"/>.</param>
/// <returns>
/// The populated term. If the page has no "grey500BoxContent" div or no "floatingBox"
/// divs, the term is returned with only its name, source and URL set.
/// </returns>
private async Task<XRay.Term> GetTerm(string baseUrl, string relativeUrl)
{
    XRay.Term result = new XRay.Term("character");
    Uri siteRoot = new Uri(new Uri(baseUrl).GetLeftPart(UriPartial.Authority));
    Uri pageUri = new Uri(siteRoot, relativeUrl);
    result.DescSrc = "Goodreads";
    result.DescUrl = pageUri.ToString();

    HtmlDocument charDoc = new HtmlDocument();
    charDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(pageUri.ToString()));

    // The class attribute is matched exactly, so the variant with a trailing space is tried as well.
    HtmlNode mainNode = charDoc.DocumentNode.SelectSingleNode("//div[@class='mainContentFloat']")
        ?? charDoc.DocumentNode.SelectSingleNode("//div[@class='mainContentFloat ']");
    result.TermName = mainNode.SelectSingleNode("./h1").InnerText;

    // A page without the content box has nothing more to extract; return what we have
    // instead of dereferencing a null node.
    HtmlNode contentBox = mainNode.SelectSingleNode("//div[@class='grey500BoxContent']");
    if (contentBox == null)
    {
        return result;
    }

    // NOTE: an XPath expression starting with "//" selects from the document root,
    // not from the node the query is issued on.
    HtmlNodeCollection boxNodes = contentBox.SelectNodes("//div[@class='floatingBox']");
    if (boxNodes == null)
    {
        return result;
    }

    foreach (HtmlNode boxNode in boxNodes)
    {
        string boxText = boxNode.InnerText.Replace("[close]", "").Trim();
        if (boxNode.Id.Contains("_aliases"))
        {
            // If present, add any aliases found
            result.Aliases.AddRange(boxText.Split(new[] { ", " }, StringSplitOptions.RemoveEmptyEntries));
        }
        else
        {
            result.Desc = boxText;
        }
    }

    return result;
}
/// <summary>
/// Builds the term list from a Shelfari book page by reading the list items of the
/// Characters, Organizations, Settings and Glossary wiki sections.
/// </summary>
/// <param name="dataUrl">
/// URL of the book page. It is downloaded only when <c>sourceHtmlDoc</c> has not been
/// loaded yet, and is used as the description URL for items without their own link.
/// </param>
/// <param name="progress">Not used by this implementation.</param>
/// <param name="token">Not used by this implementation.</param>
/// <returns>
/// The terms found, in page order per section. A term whose name was already seen is
/// logged and skipped.
/// </returns>
public override async Task<List<XRay.Term>> GetTerms(string dataUrl, IProgressBar progress, CancellationToken token)
{
    Logger.Log("Downloading Shelfari page...");
    List<XRay.Term> terms = new List<XRay.Term>();
    // Names added so far, so duplicate detection does not rescan the whole list.
    HashSet<string> seenNames = new HashSet<string>();

    if (sourceHtmlDoc == null)
    {
        sourceHtmlDoc = new HtmlDocument();
        sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(dataUrl));
    }

    //Constants for wiki processing
    Dictionary<string, string> sections = new Dictionary<string, string>
    {
        { "WikiModule_Characters", "character" },
        { "WikiModule_Organizations", "topic" },
        { "WikiModule_Settings", "topic" },
        { "WikiModule_Glossary", "topic" }
    };

    foreach (KeyValuePair<string, string> section in sections)
    {
        HtmlNodeCollection itemNodes = sourceHtmlDoc.DocumentNode.SelectNodes("//div[@id='" + section.Key + "']//ul[@class='li_6']/li");
        if (itemNodes == null)
        {
            continue; //Skip section if not found on page
        }

        foreach (HtmlNode li in itemNodes)
        {
            string itemText = li.InnerText;
            XRay.Term newTerm = new XRay.Term(section.Value); //Create term as either character/topic

            // Items are formatted as "Name: description"; without a colon the whole text is the name.
            int colonIndex = itemText.IndexOf(':');
            if (colonIndex >= 0)
            {
                newTerm.TermName = itemText.Substring(0, colonIndex);
                // The description still contains HTML-escaped ampersands, so unescape them.
                newTerm.Desc = itemText.Substring(colonIndex + 1).Replace("&amp;", "&").Trim();
            }
            else
            {
                newTerm.TermName = itemText;
            }

            newTerm.DescSrc = "shelfari";

            //Use either the associated shelfari URL of the term or if none exists, use the book's url
            const int hrefValueStart = 9; // length of: <a href="
            string itemHtml = li.InnerHtml;
            newTerm.DescUrl = dataUrl;
            if (itemHtml.StartsWith("<a href", StringComparison.Ordinal) && itemHtml.Length > hrefValueStart)
            {
                int closingQuote = itemHtml.IndexOf('"', hrefValueStart);
                // Keep the book URL if the attribute value is not terminated.
                if (closingQuote >= hrefValueStart)
                {
                    newTerm.DescUrl = itemHtml.Substring(hrefValueStart, closingQuote - hrefValueStart);
                }
            }

            //Default glossary terms to be case insensitive when searching through book
            if (section.Key == "WikiModule_Glossary")
            {
                newTerm.MatchCase = false;
            }

            if (seenNames.Add(newTerm.TermName))
            {
                terms.Add(newTerm);
            }
            else
            {
                Logger.Log("Duplicate term \"" + newTerm.TermName + "\" found. Ignoring this duplicate.");
            }
        }
    }

    return terms;
}