/
pdfIndexing.cs
94 lines (83 loc) · 3.43 KB
/
pdfIndexing.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Lucene.Net;
using Lucene.Net.Store;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.QueryParsers;
using iTextSharp.text;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
/// <summary>
/// Console program that extracts the text of every .pdf file in an input
/// directory (via iTextSharp) and stores it in a Lucene.Net index.
/// Each file becomes one Lucene document with two fields: "id" and "mainText".
/// </summary>
class pdfIndexing
{
    /// <summary>
    /// Entry point: lists the input PDFs, indexes their text, then echoes the
    /// extracted text of each file to the console.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        // Full path to input .pdf files. Listed before the index is opened so that
        // an invalid input path fails without creating or touching the index.
        string[] filesList = System.IO.Directory.GetFiles(@"/* PASTE THE PATH TO YOUR INPUT FILE(S) */", "*.pdf");

        // Path to index file. The using statements guarantee that the directory,
        // analyzer and writer are released even when extraction or indexing throws.
        using (Directory pdfIndex = FSDirectory.Open(@"/* PASTE THE PATH WHERE YOUR INDEX WILL BE SAVED */"))
        // Creating analyzer to make index searchable.
        using (Analyzer analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30))
        {
            // Creating IndexWriter
            IndexWriter.MaxFieldLength mfl = new IndexWriter.MaxFieldLength(100000);
            using (IndexWriter writer = new IndexWriter(pdfIndex, analyzer, mfl))
            {
                /* INDEX FIELDS:
                ** id & mainText are the fields of this Lucene index (see AddToIndex),
                ** you can change those fields according to your needs.
                */
                int idNumber = 0;
                foreach (string file in filesList)
                {
                    string pdfText = GetAllText(file);
                    AddToIndex(idNumber, pdfText, writer);
                    idNumber++;
                }
            }
        }

        DisplayAllInput(filesList);
        Console.ReadKey();
    }

    // ========================== ADDITIONAL METHODS ==========================

    /// <summary>
    /// Uses the iTextSharp library to extract the whole text from a PDF.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file to read.</param>
    /// <returns>The text of all pages, each page followed by a line break.</returns>
    public static string GetAllText(String pdfPath)
    {
        iTextSharp.text.pdf.PdfReader reader = new iTextSharp.text.pdf.PdfReader(pdfPath);
        try
        {
            System.Text.StringBuilder output = new System.Text.StringBuilder();
            // PDF page numbers are 1-based.
            for (int i = 1; i <= reader.NumberOfPages; i++)
            {
                output.AppendLine(iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, i, new SimpleTextExtractionStrategy()));
            }
            return output.ToString();
        }
        finally
        {
            // Release the reader (and the underlying file) even if extraction fails.
            reader.Close();
        }
    }

    /// <summary>
    /// Indexes the given text as one document keyed by <paramref name="id"/>.
    /// </summary>
    /// <param name="id">Identifier stored in the "id" field.</param>
    /// <param name="text">Text stored and indexed in the "mainText" field.</param>
    /// <param name="writer">Open index writer that receives the document.</param>
    private static void AddToIndex(int id, string text, IndexWriter writer)
    {
        string idValue = id.ToString();
        Term term = new Term("id", idValue);
        Lucene.Net.Documents.Document doc = new Lucene.Net.Documents.Document();
        doc.Add(new Field("id", idValue, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("mainText", text, Field.Store.YES, Field.Index.ANALYZED));
        // UpdateDocument deletes any document containing the term and then adds
        // the new one, so a separate AddDocument call beforehand would only index
        // the same document twice and immediately delete the first copy.
        writer.UpdateDocument(term, doc);
    }

    /// <summary>
    /// Displays the extracted text of every file in the input directory, waiting
    /// for a key press after each one, then prints the number of files.
    /// </summary>
    /// <param name="files">Paths of the PDF files to display.</param>
    private static void DisplayAllInput(string[] files)
    {
        int index = 1;
        foreach (string file in files)
        {
            // A PDF is a binary container: show the extracted text, not the raw bytes.
            Console.WriteLine("Text body of file {0}: {1}", index, GetAllText(file));
            index++;
            Console.ReadKey();
        }
        Console.WriteLine("Ok, done!The total number of files in your input directory, is {0}", files.Length);
    }
}