Ejemplo n.º 1
0
 /// <summary>
 /// Creates an article from an HTML extractor, a title, the HTML text, the three
 /// file paths associated with the article, and an optional article type.
 /// </summary>
 /// <param name="htmlPlainTextExtractor">The HTML extractor stored for this article</param>
 /// <param name="title">Title of the article</param>
 /// <param name="htmlText">HTML text of the article</param>
 /// <param name="titlePath">File path of the title</param>
 /// <param name="summaryPath">File path of the summary</param>
 /// <param name="typePath">File path of the type</param>
 /// <param name="type">Type of the article; <see cref="ArticleType.NotSpecified"/> when omitted</param>
 public Article(IHtmlPlainTextExtractor htmlPlainTextExtractor, string title, string htmlText, string titlePath, string summaryPath, string typePath, ArticleType type = ArticleType.NotSpecified)
 {
     // The field shares its name with the parameter, so it must be qualified with "this"
     this.htmlPlainTextExtractor = htmlPlainTextExtractor;

     // Content
     Title = title;
     HtmlText = htmlText;

     // Backing file locations
     TitlePath = titlePath;
     SummaryPath = summaryPath;
     TypePath = typePath;

     // Classification
     Type = type;
 }
        /// <summary>
        /// Queries every article in the dataset, building each one with the supplied extractor.
        /// </summary>
        /// <remarks>
        /// Article IDs are taken to be 0 .. (number of files in the title folder - 1), so the
        /// title folder and the summary folder are expected to contain identically named files.
        /// </remarks>
        /// <param name="extractor">The plain text extractor handed to each article</param>
        /// <returns>
        /// A list with one entry per file in the title folder, in ascending ID order; each entry
        /// is whatever the single-article query returns for that ID.
        /// </returns>
        public List<Article> Query(IHtmlPlainTextExtractor extractor)
        {
            // Only the number of title files is used; the IDs are assumed to start at 0 and increase by 1
            int articleCount = Directory.GetFiles(titlePath).Length;
            System.Diagnostics.Debug.Assert(articleCount == Directory.GetFiles(summaryPath).Length);

            List<Article> result = new List<Article>();

            // Look up each ID in turn
            int id = 0;
            while (id < articleCount)
            {
                result.Add(Query(id, extractor));
                id++;
            }

            return result;
        }
        /// <summary>
        /// Creates a querier bound to one dataset, remembering a default HTML extractor.
        /// </summary>
        /// <param name="dataset">The dataset to query (train or test)</param>
        /// <param name="defaultExtractor">The HTML extractor stored as the default for articles</param>
        /// <exception cref="NotImplementedException">
        /// Thrown when <paramref name="dataset"/> is neither the train nor the test dataset.
        /// </exception>
        public ArticleQuerier(QueryDataset dataset, IHtmlPlainTextExtractor defaultExtractor)
        {
            // Pick the sub-directory that belongs to the requested dataset
            string datasetDir;
            switch (dataset)
            {
                case QueryDataset.TrainDataset:
                    datasetDir = General.Constants.TrainDatasetDir;
                    break;
                case QueryDataset.TestDataset:
                    datasetDir = General.Constants.TestDatasetDir;
                    break;
                default:
                    throw new NotImplementedException();
            }

            // Dataset root, then the title / summary / type folders beneath it
            datasetPath = Path.Combine(General.Constants.DatasetPath, datasetDir);
            titlePath = Path.Combine(datasetPath, Constants.TitleDir);
            summaryPath = Path.Combine(datasetPath, Constants.SummaryDir);
            typePath = Path.Combine(datasetPath, Constants.TypeDir);

            this.defaultExtractor = defaultExtractor;
        }
        /// <summary>
        /// Queries the article with the given ID in the dataset, using the given extractor
        /// as the article's HTML extractor.
        /// </summary>
        /// <param name="id">
        /// The ID of the article; its string form is the file name looked up in the
        /// title, summary and type folders.
        /// </param>
        /// <param name="extractor">The HTML text extractor for the article</param>
        /// <returns>
        /// The article, or <c>null</c> when reading the title or summary file raises an
        /// <see cref="IOException"/>. The article type is <see cref="ArticleType.NotSpecified"/>
        /// when the type file is missing, unreadable, or does not hold a defined
        /// <see cref="ArticleType"/> value.
        /// </returns>
        public Article Query(int id, IHtmlPlainTextExtractor extractor)
        {
            string fileName = id.ToString();
            string titleFilePath = Path.Combine(titlePath, fileName);
            string summaryFilePath = Path.Combine(summaryPath, fileName);
            string typeFilePath = Path.Combine(typePath, fileName);

            // Read title and summary; both are required
            string title;
            string summary;
            try
            {
                title = ReadAllTextFrom(titleFilePath);
                summary = ReadAllTextFrom(summaryFilePath);
            }
            catch (IOException)
            {
                return null;
            }

            // Read marked type; this is optional and best-effort
            ArticleType type = ArticleType.NotSpecified;
            try
            {
                if (File.Exists(typeFilePath))
                {
                    // TryParse avoids using exceptions for the expected "not a number" case
                    int typeInt;
                    if (int.TryParse(ReadAllTextFrom(typeFilePath), out typeInt)
                        && Enum.IsDefined(typeof(ArticleType), typeInt))
                    {
                        type = (ArticleType)typeInt;
                    }
                }
            }
            catch (Exception)
            {
                // Deliberately broad: any failure while reading the type file means
                // "type not specified yet" and must not fail the whole query
                type = ArticleType.NotSpecified;
            }

            // Construct article
            return new Article(extractor,
                title, summary,
                titleFilePath, summaryFilePath, typeFilePath,
                type);
        }

        /// <summary>
        /// Reads the whole content of a file as text, releasing the file handle even
        /// when reading fails.
        /// </summary>
        /// <param name="path">Path of the file to read</param>
        /// <returns>The full text content of the file</returns>
        private static string ReadAllTextFrom(string path)
        {
            using (FileStream stream = new FileStream(path, FileMode.Open, FileAccess.Read))
            using (StreamReader reader = new StreamReader(stream))
            {
                return reader.ReadToEnd();
            }
        }
Ejemplo n.º 5
0
 /// <summary>
 /// Creates an article that carries only an HTML extractor; the title and the
 /// HTML text are passed on as empty strings.
 /// </summary>
 /// <param name="htmlPlainTextExtractor">The HTML extractor to use for this article</param>
 protected Article(IHtmlPlainTextExtractor htmlPlainTextExtractor)
     : this(htmlPlainTextExtractor, string.Empty, string.Empty)
 {
 }
Ejemplo n.º 6
0
 /// <summary>
 /// Creates an article from an HTML extractor, a title, the HTML text and an optional
 /// article type. The title, summary and type file paths are passed on as empty strings.
 /// </summary>
 /// <param name="htmlPlainTextExtractor">The HTML extractor to use for this article</param>
 /// <param name="title">Title of the article</param>
 /// <param name="htmlText">HTML text of the article</param>
 /// <param name="type">Type of the article; <see cref="ArticleType.NotSpecified"/> when omitted</param>
 public Article(IHtmlPlainTextExtractor htmlPlainTextExtractor, string title, string htmlText, ArticleType type = ArticleType.NotSpecified)
     : this(htmlPlainTextExtractor, title, htmlText, string.Empty, string.Empty, string.Empty, type)
 {
 }