/// <summary>
/// Creates an article from its content, the files that content is stored in, and its type
/// </summary>
/// <param name="htmlPlainTextExtractor">The HTML extractor kept by the article</param>
/// <param name="title">Title of the article</param>
/// <param name="htmlText">HTML text of the article</param>
/// <param name="titlePath">File path of the title</param>
/// <param name="summaryPath">File path of the summary</param>
/// <param name="typePath">File path of the type</param>
/// <param name="type">Type of the article; <see cref="ArticleType.NotSpecified"/> when omitted</param>
public Article(
    IHtmlPlainTextExtractor htmlPlainTextExtractor,
    string title,
    string htmlText,
    string titlePath,
    string summaryPath,
    string typePath,
    ArticleType type = ArticleType.NotSpecified)
{
    // The member and the parameter share a name, so this one needs the explicit qualifier.
    this.htmlPlainTextExtractor = htmlPlainTextExtractor;

    // Content of the article.
    Title = title;
    HtmlText = htmlText;

    // Where each part of the article is stored.
    TitlePath = titlePath;
    SummaryPath = summaryPath;
    TypePath = typePath;

    Type = type;
}
/// <summary>
/// Query every article of the dataset, attaching the given extractor to each of them
/// </summary>
/// <param name="extractor">The plain text extractor handed to every queried article</param>
/// <returns>
/// One entry per file found in the title folder, in ascending ID order.
/// An entry is whatever the single-article query returns for that ID, which can be null.
/// </returns>
public List<Article> Query(IHtmlPlainTextExtractor extractor)
{
    List<Article> result = new List<Article>();

    // IDs are derived from the number of files in the title folder. This relies on:
    //   * the title folder and the summary folder holding the same filenames
    //   * filenames starting at 0 and increasing by 1
    string[] titleFiles = Directory.GetFiles(titlePath);
    System.Diagnostics.Debug.Assert(titleFiles.Length == Directory.GetFiles(summaryPath).Length);

    // Walk the IDs 0 .. count-1 and query them one by one
    int articleCount = titleFiles.Length;
    int id = 0;
    while (id < articleCount)
    {
        Article queried = Query(id, extractor);
        result.Add(queried);
        id++;
    }

    return result;
}
/// <summary>
/// Creates a querier bound to one dataset, remembering the extractor to use by default
/// </summary>
/// <param name="dataset">The dataset to query</param>
/// <param name="defaultExtractor">The HTML extractor stored as the default one</param>
/// <exception cref="NotImplementedException">
/// The dataset is neither the train dataset nor the test dataset
/// </exception>
public ArticleQuerier(QueryDataset dataset, IHtmlPlainTextExtractor defaultExtractor)
{
    // Pick the directory name that belongs to the requested dataset
    string datasetDir;
    if (dataset == QueryDataset.TrainDataset)
    {
        datasetDir = General.Constants.TrainDatasetDir;
    }
    else if (dataset == QueryDataset.TestDataset)
    {
        datasetDir = General.Constants.TestDatasetDir;
    }
    else
    {
        throw new NotImplementedException();
    }

    this.datasetPath = Path.Combine(General.Constants.DatasetPath, datasetDir);

    // Title, summary and type folders all live directly under the dataset folder
    this.titlePath = Path.Combine(this.datasetPath, Constants.TitleDir);
    this.summaryPath = Path.Combine(this.datasetPath, Constants.SummaryDir);
    this.typePath = Path.Combine(this.datasetPath, Constants.TypeDir);

    this.defaultExtractor = defaultExtractor;
}
/// <summary>
/// Query an article with given ID in specific dataset, and use a given extractor as the HTML extractor
/// </summary>
/// <param name="id">The ID of article; it is also the file name looked up in the title, summary and type folders</param>
/// <param name="extractor">The HTML text extractor for article</param>
/// <returns>
/// The article, or null when reading its title or summary file raises an <see cref="IOException"/>
/// </returns>
public Article Query(int id, IHtmlPlainTextExtractor extractor)
{
    string fileName = id.ToString();
    string titleFilePath = Path.Combine(titlePath, fileName);
    string summaryFilePath = Path.Combine(summaryPath, fileName);
    string typeFilePath = Path.Combine(typePath, fileName);

    // Read title and summary; both are required for an article to exist
    string title;
    string summary;
    try
    {
        title = ReadTextFile(titleFilePath);
        summary = ReadTextFile(summaryFilePath);
    }
    catch (IOException)
    {
        return null;
    }

    // Read marked type. This is best effort: a missing, unreadable or malformed
    // type file simply leaves the article as NotSpecified.
    ArticleType type = ArticleType.NotSpecified;
    try
    {
        int typeValue;
        if (File.Exists(typeFilePath)
            && int.TryParse(ReadTextFile(typeFilePath), out typeValue)
            && Enum.IsDefined(typeof(ArticleType), typeValue))
        {
            type = (ArticleType)typeValue;
        }
    }
    catch (Exception)
    {
        // Type not specified yet: "type" is only assigned after every check
        // succeeded, so it still holds NotSpecified here.
    }

    // Construct article
    return new Article(extractor, title, summary, titleFilePath, summaryFilePath, typeFilePath, type);
}

/// <summary>
/// Reads the whole content of a file as text, releasing the file handle even when reading fails
/// </summary>
/// <param name="path">Path of the file to read</param>
/// <returns>The complete text of the file</returns>
private static string ReadTextFile(string path)
{
    using (FileStream stream = new FileStream(path, FileMode.Open, FileAccess.Read))
    using (StreamReader reader = new StreamReader(stream))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Creates an empty article: only the HTML extractor is supplied, title and HTML text are empty strings
/// </summary>
/// <param name="htmlPlainTextExtractor">The HTML extractor kept by the article</param>
protected Article(IHtmlPlainTextExtractor htmlPlainTextExtractor)
    : this(htmlPlainTextExtractor, title: "", htmlText: "")
{
}
/// <summary>
/// Creates an article from in-memory content only; the title, summary and type file paths are set to empty strings
/// </summary>
/// <param name="htmlPlainTextExtractor">The HTML extractor kept by the article</param>
/// <param name="title">Title of the article</param>
/// <param name="htmlText">HTML text of the article</param>
/// <param name="type">Type of the article; <see cref="ArticleType.NotSpecified"/> when omitted</param>
public Article(
    IHtmlPlainTextExtractor htmlPlainTextExtractor,
    string title,
    string htmlText,
    ArticleType type = ArticleType.NotSpecified)
    : this(
        htmlPlainTextExtractor,
        title,
        htmlText,
        titlePath: "",
        summaryPath: "",
        typePath: "",
        type: type)
{
}