/// <summary>
/// Loads the XML document at <c>fileName</c> and converts it into a new <see cref="Protocol"/>.
/// </summary>
/// <param name="context">Database context used to prefetch existing objects and passed on to the body parser.</param>
/// <returns>The newly created protocol object.</returns>
public Protocol Parse(KnessetContext context)
{
    // Prefetch existing objects from the DB so that new-vs-existing checks are quick.
    PrepareHashsets(context);

    var protocol = new Protocol();

    // Load the XML document using LINQ to XML.
    XDocument xml = XDocument.Load(fileName);

    // Header: inside the part named "/word/header2.xml", skip the first child
    // (page number) and keep the single element after it (title).
    var headerTitle = xml.Document.Root
        .Elements(pkg + "part")
        .Where(part => part.Attribute(pkg + "name").Value == "/word/header2.xml")
        .Select(part => part.Element(pkg + "xmlData")
                            .Element(w + "hdr")
                            .Elements()
                            .Skip(1)
                            .Take(1)
                            .ToArray())
        .First();

    // Body: the w:body element of the part named "/word/document.xml".
    var bodyElement = xml.Document.Root
        .Elements(pkg + "part")
        .Where(part => part.Attribute(pkg + "name").Value == "/word/document.xml")
        .Select(part => part.Element(pkg + "xmlData")
                            .Element(w + "document")
                            .Element(w + "body"))
        .First();

    // Parse some protocol metadata from the header.
    ParseHeaderMetadata(protocol, headerTitle);

    // Parse the rest of the metadata and read all the content from the body.
    ParseBody(protocol, bodyElement, context);

    return protocol;
}
/// <summary>
/// Creates the window and immediately parses the given protocol file so the
/// resulting protocol object can be data-bound to the UI.
/// </summary>
/// <param name="fileName">Path of the protocol file to parse; also used as the caption of the error dialog.</param>
public AddProtocolWindow(string fileName)
{
    InitializeComponent();

    try
    {
        // The protocol file is loaded before the window is shown because the
        // protocol object is bound to the UI presented to the user.
        //
        // A single context object is kept (not disposed per operation) because
        // another context would consider fetched DB entities as new and try to
        // insert them. The downside is that memory usage will not be optimal
        // when importing many protocols.
        context = new KnessetContext();
        fileParser = new ProtocolFileParser(fileName);
        Protocol = fileParser.Parse(context);

        // Data-bind the UI to the protocol object.
        DataContext = Protocol;
        DisplayStatsDebug();
    }
    catch (Exception ex)
    {
        // On failure the assignments above did not complete; per the original
        // note, the user of this window is expected not to show it in that case.
        if (AutoSaveAll)
        {
            // No message box is shown when AutoSaveAll is set.
            return;
        }

        MessageBox.Show(ex.ToString(), fileName, MessageBoxButton.OK, MessageBoxImage.Asterisk);
    }
}
/// <summary>
/// Click handler that deletes the word groups currently selected in <c>GroupsBox</c>
/// from the database and then asks <c>UIData</c> to update its data.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void DeleteGroup(object sender, RoutedEventArgs e)
{
    try
    {
        if (GroupsBox.SelectedItems.Count == 0)
        {
            return;
        }

        // Show the wait cursor while talking to the database; the finally block
        // below always restores it (previously it was restored but never set).
        Mouse.OverrideCursor = Cursors.Wait;

        // Snapshot the selection so the loop does not iterate the live
        // selection collection of the list box.
        var selectedGroups = GroupsBox.SelectedItems.OfType<WordsGroup>().ToList();

        using (KnessetContext context = new KnessetContext())
        {
            foreach (var wg in selectedGroups)
            {
                // The selected objects come from outside this context, so they
                // are attached first and then marked for removal.
                context.WordsGroups.Attach(wg);
                context.WordsGroups.Remove(wg);
            }

            context.SaveChanges();
        }

        UIData.UpdateData(false, true);
    }
    catch (Exception ex)
    {
        // Restore the normal cursor before the modal error dialog is shown.
        Mouse.OverrideCursor = null;
        MessageBox.Show(ex.ToString());
    }
    finally
    {
        Mouse.OverrideCursor = null;
    }
}
/// <summary>
/// Computes the general statistics (counts and per-protocol / per-paragraph
/// averages) from the database and binds the result to this window's DataContext.
/// On failure the exception is shown and the window is closed.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void InitStatistics(object sender, RoutedEventArgs e)
{
    // note to db admins,
    // there is no reason this should be slow
    // try setting:
    //   query_cache_limit=20K
    //   query_cache_size=4M
    //   query_cache_type=1
    try
    {
        Mouse.OverrideCursor = Cursors.Wait;
        var res = new GeneralStatisticsResult();

        using (var context = new KnessetContext())
        {
            int numWordsSpoken = context.ParagraphWords.Count();
            int numParagraphs = context.Paragraphs.Count();
            res.NumProtocols = context.Protocols.Count();

            res.ParagraphsPerProtocol = (float)SafeRatio(numParagraphs, res.NumProtocols);

            /*
             * calculate speakers by protocol
             * the key is that each speaker has only one paragraph with pn_pg_number=1
             * SELECT AVG(sq.c) as v FROM (
             *      SELECT COUNT(*) as c
             *      FROM paragraph p
             *      WHERE p.pn_pg_number = 1
             *      GROUP BY p.c_name, p.pr_number
             * ) as sq
             */
            // The counts are projected as nullable so that an empty table yields
            // a null average instead of an exception; null is reported as 0.
            double? averageSpeakers = context.Paragraphs
                .Where(x => x.pn_pg_number == 1)
                .GroupBy(x => new { x.c_name, x.pr_number })
                .Select(r => (int?)r.Count())
                .Average();
            res.SpeakersPerProtocol = (float)(averageSpeakers ?? 0);

            res.ParagraphsPerProtocolSpeaker = (float)SafeRatio(res.ParagraphsPerProtocol, res.SpeakersPerProtocol);
            res.WordsPerProtocol = (float)SafeRatio(numWordsSpoken, res.NumProtocols);
            res.WordsPerParagraph = (float)SafeRatio(numWordsSpoken, numParagraphs);
        }

        DataContext = res;
    }
    catch (Exception ex)
    {
        // Restore the normal cursor before the modal error dialog is shown.
        Mouse.OverrideCursor = null;
        MessageBox.Show(ex.ToString());
        Close();
    }
    finally
    {
        Mouse.OverrideCursor = null;
    }
}

// Divides numerator by denominator, returning 0 instead of NaN/Infinity when the denominator is 0.
private static double SafeRatio(double numerator, double denominator)
{
    return denominator == 0 ? 0 : numerator / denominator;
}
/// <summary>
/// Fills <c>AllCommittees</c> and <c>AllSpeakers</c> from the database. Each list
/// starts with an empty string followed by the sorted names. Any exception is
/// written to standard error and otherwise ignored.
/// </summary>
public WordFrequenciesData()
{
    try
    {
        using (var db = new KnessetContext())
        {
            // The leading empty entry is placed ahead of the names read from the database.
            string[] blankEntry = { string.Empty };

            var committeeNames = db.Committees
                .Select(committee => committee.c_name)
                .OrderBy(name => name);

            var speakerNames = db.Paragraphs
                .Select(paragraph => paragraph.pn_name)
                .Distinct()
                .OrderBy(name => name);

            AllCommittees = blankEntry.Union(committeeNames).ToList();
            AllSpeakers = blankEntry.Union(speakerNames).ToList();
        }
    }
    catch (Exception ex)
    {
        Console.Error.WriteLine(ex);
    }
}
/// <summary>
/// Loads the protocol identified by the committee name and protocol number of
/// <paramref name="p"/>, together with its paragraphs, words, presence and
/// invitations, into the <c>protocol</c> field.
/// </summary>
/// <param name="p">Protocol whose <c>c_name</c> and <c>pr_number</c> identify the record to load.</param>
/// <param name="initialPg">
/// Optional paragraph; when not null, <c>initialParagraph</c> is set to the loaded
/// paragraph with the same <c>pg_number</c> (or null when there is none).
/// </param>
private void LoadProtocol(Protocol p, Paragraph initialPg)
{
    Mouse.OverrideCursor = Cursors.Wait;

    try
    {
        using (var db = new KnessetContext())
        {
            // Load protocol + paragraphs + words in one go.
            protocol = db.Protocols
                .Include("paragraphs.words")
                .First(candidate => candidate.c_name == p.c_name && candidate.pr_number == p.pr_number);

            var tracked = db.Entry(protocol);
            tracked.Collection(item => item.persence).Load();    // load presence
            tracked.Collection(item => item.invitations).Load(); // load invitations

            if (initialPg == null)
            {
                // Nothing to pre-select; initialParagraph is left untouched.
                return;
            }

            initialParagraph = protocol.paragraphs
                .FirstOrDefault(candidate => candidate.pg_number == initialPg.pg_number);
        }
    }
    finally
    {
        Mouse.OverrideCursor = null;
    }
}
// parse rest of metadata and read all the content from the body
/// <summary>
/// Walks the direct children of the document body with a state machine and fills
/// the protocol number and date on <paramref name="ret"/>, the presence list, the
/// invitations list and the spoken paragraphs.
/// </summary>
/// <param name="ret">Protocol object being populated.</param>
/// <param name="docBody">The body element whose child elements are scanned in order.</param>
/// <param name="context">Database context, passed through to <c>AddParagraph</c>.</param>
/// <exception cref="FormatException">The protocol element has no "Date" attribute, or the date is not in dd/MM/yyyy form.</exception>
/// <exception cref="Exception">The scan did not reach the finished state and no paragraph was read.</exception>
private void ParseBody(Protocol ret, XElement docBody, KnessetContext context)
{
    var state = ProtocolState.InitialScan; // use a state machine for the different protocol parts
    Person currentSpeaker = null;

    foreach (XElement el in docBody.Elements())
    {
        switch (state)
        {
            case ProtocolState.InitialScan:
                if (IsCustomXml(el, "פרוטוקול"))
                {
                    // Try to read the protocol number from the custom XML. A missing or
                    // non-numeric attribute is treated like 0 so the title fallback below
                    // is actually reachable instead of throwing.
                    int numberFromAttribute;
                    if (!int.TryParse(ReadCustomXmlAttribute(el, "Num"), NumberStyles.Integer, CultureInfo.InvariantCulture, out numberFromAttribute))
                    {
                        numberFromAttribute = 0;
                    }

                    ret.pr_number = numberFromAttribute;
                    if (ret.pr_number == 0)
                    {
                        // If the protocol number is not in the custom XML, read it from the
                        // protocol title; stays 0 when the title has no number either.
                        ret.pr_number = ReadProtocolNumberFromTitle(el);
                    }

                    // Read the protocol date from the custom XML attribute.
                    string dateText = ReadCustomXmlAttribute(el, "Date");
                    if (dateText == null)
                    {
                        throw new FormatException("Protocol parsing failed, the protocol element has no Date attribute");
                    }

                    ret.pr_date = DateTime.ParseExact(dateText, "dd/MM/yyyy", CultureInfo.InvariantCulture);

                    // The next state is some protocol info.
                    state = ProtocolState.ProtocolInfo;
                }
                break;

            case ProtocolState.ProtocolInfo:
                // Read info until we reach the agenda.
                if (IsCustomXml(el, "סדר_יום"))
                {
                    state = ProtocolState.Agenda;
                }
                break;

            case ProtocolState.Agenda:
                if (IsSubject(el))
                {
                    // We've reached the subject but there will be another subject, so this one is ignored.
                    state = ProtocolState.SubjectLong;
                }
                break;

            case ProtocolState.SubjectLong:
                // Ignore everything until we reach the list of members that are present.
                if (ContainsCustomXml(el, "חברי_הוועדה"))
                {
                    state = ProtocolState.CommitteeMembers;
                }
                break;

            case ProtocolState.CommitteeMembers:
                // Read the list of members that are present until we reach the invitations part.
                if (el.Name == w + "p")
                {
                    if (ContainsCustomXml(el, "מוזמנים"))
                    {
                        // Finished presence and now moving on to the invitations list.
                        state = ProtocolState.Invitations;
                        break;
                    }
                    else if (IsSubject(el))
                    {
                        // No invitations.
                        state = ReadSubject(ret, el);
                        break;
                    }

                    string content = ReadParagraph(el);
                    if (string.IsNullOrWhiteSpace(content) || content == "חברי הכנסת:")
                    {
                        break;
                    }

                    if (content.Contains("מוזמנים:"))
                    {
                        // In some rare cases the invitations title is not marked by custom XML
                        // and we need to check the text for it.
                        state = ProtocolState.Invitations;
                        break;
                    }

                    // If this is not a special line we got a presence to add (once per person).
                    var person = FindOrAddPerson(content);
                    if (!newPresence.Any(x => x.person == person))
                    {
                        newPresence.Add(new Presence { person = person, protocol = ret });
                    }
                }
                break;

            case ProtocolState.Invitations:
                if (el.Name == w + "tbl")
                {
                    // Invitations of non-committee members are stored in a table.
                    // If the table is right-to-left the first column has the name of the
                    // person, otherwise the last column does.
                    bool isInvitationsTableRtl = el.Descendants(w + "bidiVisual").Any();
                    var items = from tr in el.Elements(w + "tr")
                                select ReadParagraph((isInvitationsTableRtl
                                    ? tr.Elements(w + "tc").First()
                                    : tr.Elements(w + "tc").Last()).Element(w + "p"));

                    foreach (var invitation in items.Distinct())
                    {
                        newInvitations.Add(new Invitation { person = FindOrAddPerson(invitation), protocol = ret });
                    }
                }
                else if (IsSubject(el))
                {
                    // After the invitations table we go on reading until we reach the header
                    // before the protocol starts; now we read the protocol title.
                    state = ReadSubject(ret, el);
                }
                break;

            case ProtocolState.Subject:
                // After the subject heading we expect the actual protocol to start.
                state = ProtocolState.Talking;
                goto case ProtocolState.Talking;

            case ProtocolState.Talking:
                // NOTE(review): this first branch wins for every w:p while a speaker is set,
                // so a speaker marker that is itself a w:p would be stored as content of the
                // previous speaker - confirm against real documents.
                if (currentSpeaker != null && el.Name == w + "p")
                {
                    // A 'normal' paragraph while we have a speaker: add it to that speaker.
                    string paragraphContent = ReadParagraph(el);
                    AddParagraph(ret, currentSpeaker, paragraphContent, context);
                }
                else if (ContainsCustomXml(el, "יור") || ContainsCustomXml(el, "דובר") || ContainsCustomXml(el, "דובר_המשך") || ContainsCustomXml(el, "אורח"))
                {
                    // This is a new speaker starting to talk.
                    var contentElem = (el.Name == w + "p") ? el.Element(w + "customXml") : el.Element(w + "p");
                    string speakerName = ReadParagraph(contentElem);
                    if (speakerName.EndsWith(":", StringComparison.Ordinal))
                    {
                        // Drop the trailing colon after the speaker's name.
                        speakerName = speakerName.Substring(0, speakerName.Length - 1);
                    }

                    currentSpeaker = FindOrAddPerson(speakerName);
                }
                else if (IsCustomXml(el, "קריאה"))
                {
                    // Sometimes people make indistinguishable noises, just ignore...
                    currentSpeaker = null;
                }
                else if (IsCustomXml(el, "סיום"))
                {
                    // We've reached the end of the protocol.
                    state = ProtocolState.Finished;
                }
                break;

            case ProtocolState.Finished:
                break;
        }
    }

    // A document that never reached the end marker is still accepted as long as
    // at least one paragraph was read.
    if (state != ProtocolState.Finished && newParagraphs.Count == 0)
    {
        throw new Exception(string.Format("Protocol parsing failed, did not pass state {0}", state));
    }
}

// Returns the w:val of the first w:attr under the element's w:customXmlPr whose w:name equals attributeName, or null when absent.
private string ReadCustomXmlAttribute(XElement customXml, string attributeName)
{
    XElement properties = customXml.Element(w + "customXmlPr");
    if (properties == null)
    {
        return null;
    }

    foreach (XElement attr in properties.Elements(w + "attr"))
    {
        XAttribute name = attr.Attribute(w + "name");
        if (name != null && name.Value == attributeName)
        {
            XAttribute val = attr.Attribute(w + "val");
            return val == null ? null : val.Value;
        }
    }

    return null;
}

// Returns the trailing number of the last title run that ends with digits, or 0 when there is none.
private int ReadProtocolNumberFromTitle(XElement customXml)
{
    XElement titleParagraph = customXml.Element(w + "p");
    if (titleParagraph == null)
    {
        return 0;
    }

    string lastNumberText = null;
    foreach (XElement run in titleParagraph.Elements(w + "r"))
    {
        // Runs without a w:t child carry no text and are skipped.
        XElement text = run.Element(w + "t");
        if (text == null)
        {
            continue;
        }

        Match match = Regex.Match(text.Value.Trim(), "(\\d+)\\s*$");
        if (match.Success)
        {
            lastNumberText = match.Groups[1].Value;
        }
    }

    int number;
    if (lastNumberText != null && int.TryParse(lastNumberText, NumberStyles.Integer, CultureInfo.InvariantCulture, out number))
    {
        return number;
    }

    // Else we have a protocol without a number (?)
    return 0;
}
// preload db objects so we don't try to add them again by mistake and so we can reference them from new objects.
/// <summary>
/// Reads all persons and all words from the database into lookup dictionaries,
/// keyed by person name and by word text respectively.
/// </summary>
/// <param name="context">Database context to read the persons and words from.</param>
private void PrepareHashsets(KnessetContext context)
{
    var personsByName = context.Persons.ToDictionary(person => person.pn_name, person => person);
    var wordsByText = context.Words.ToDictionary(entry => entry.word, entry => entry);

    existingPersons = personsByName;
    existingWords = wordsByText;
}
// helpers:

/// <summary>
/// Creates a new paragraph for the given speaker and splits its text into words
/// and fillers, recording each word with its character offset and running word
/// number. Blank content is ignored.
/// </summary>
/// <param name="protocol">Protocol the paragraph belongs to.</param>
/// <param name="speaker">Person speaking; the per-speaker paragraph counter is keyed by its <c>pn_name</c>.</param>
/// <param name="paragraphContent">Raw paragraph text.</param>
/// <param name="context">Database context (not used by this method).</param>
private void AddParagraph(Protocol protocol, Person speaker, string paragraphContent, KnessetContext context)
{
    if (string.IsNullOrWhiteSpace(paragraphContent))
    {
        return;
    }

    // Advance this speaker's own paragraph counter, starting from 0 on first use.
    string speakerKey = speaker.pn_name;
    if (!speakerParahraphs.ContainsKey(speakerKey))
    {
        speakerParahraphs[speakerKey] = 0;
    }

    speakerParahraphs[speakerKey] += 1;

    var paragraph = new Paragraph();
    paragraph.protocol = protocol;
    paragraph.speaker = speaker;
    paragraph.pg_number = newParagraphs.Count + 1;
    paragraph.pn_pg_number = speakerParahraphs[speakerKey];
    newParagraphs.Add(paragraph);

    int position = 0;                     // running character offset inside the paragraph
    var fillers = new StringBuilder();    // everything the reader reports as filler

    paragraphReader.Read(
        paragraphContent,
        word =>
        {
            // Reuse a known word object, or register a new one the first time it is seen.
            Word entry;
            if (!existingWords.TryGetValue(word, out entry))
            {
                entry = new Word { word = word };
                newWords.Add(entry);
                existingWords.Add(word, entry);
            }

            newParagraphWords.Add(new ParagraphWord
            {
                paragraph = paragraph,
                WordObj = entry,
                pg_offset = position,
                word_number = paragraph.pg_num_words++
            });

            position += word.Length;
        },
        filler =>
        {
            fillers.Append(filler);
            position += filler.Length;
        });

    paragraph.pg_space_fillers = fillers.ToString();
}
/// <summary>
/// Click handler that validates the group name and word list entered by the
/// user and, when valid, saves a new words group (adding any words that do not
/// exist yet), then closes the dialog with a positive result.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void SaveGroup(object sender, RoutedEventArgs e)
{
    // Validations that do not require DB access come first.
    string groupName = groupNameTxt.Text;
    if (groupName.Length == 0)
    {
        MessageBox.Show("חובה להזין שם לקבוצה");
        groupNameTxt.Focus();
        return;
    }

    // The paragraph reader strips all "non-word" chars and splits the text into words;
    // the set removes duplicates.
    var parser = new ParagraphReader();
    var uniqueWords = new HashSet<string>(parser.ReadWords(wordsListTxt.Text));

    if (uniqueWords.Count == 0)
    {
        MessageBox.Show("חובה להזין מילים לקבוצה");
        wordsListTxt.SelectAll();
        return;
    }

    if (uniqueWords.Count > WordsGroup.MaxItemsInGroup)
    {
        MessageBox.Show(string.Format("לא ניתן להזין יותר מ-{0} מילים בקבוצה", WordsGroup.MaxItemsInGroup));
        return;
    }

    try
    {
        using (var db = new KnessetContext())
        {
            // Validations that do require DB access: the group name must be new.
            if (db.WordsGroups.Find(groupName) != null)
            {
                MessageBox.Show("כבר קיימת קבוצה עם שם זה");
                return;
            }

            // Input is OK: save the new group. A word that is not in the words
            // relation yet is added (groups may be defined before loading protocols).
            var newGroup = new WordsGroup { g_name = groupName };
            db.WordsGroups.Add(newGroup);

            foreach (string text in uniqueWords)
            {
                Word wordEntity = db.Words.Find(text);
                if (wordEntity == null)
                {
                    wordEntity = new Word { word = text };
                    db.Words.Add(wordEntity);
                }

                db.WordInGroups.Add(new WordInGroup { wordsGroup = newGroup, WordObj = wordEntity });
            }

            // Commit all changes to the DB.
            db.SaveChanges();
        }

        // Can be used by the parent window - marks success.
        DialogResult = true;
        Close();
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.ToString());
    }
}
/// <summary>
/// Click handler that writes a text index of word occurrences to a file chosen
/// by the user. The occurrences can be filtered by the selected word groups
/// and/or the selected protocols; at least one of the two must be selected.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void CreateIndex(object sender, RoutedEventArgs e)
{
    var groups = GroupsBox.SelectedItems.OfType<WordsGroup>().ToList();
    var protocols = ProtocolsBox.SelectedItems.OfType<Protocol>().ToList();

    if (groups.Count == 0 && protocols.Count == 0)
    {
        MessageBox.Show("חובה לבחור פרוטוקולים או קבוצות");
        return;
    }

    SaveFileDialog dialog = new SaveFileDialog { Filter = "Text Index File | *.txt", AddExtension = true };
    if (!dialog.ShowDialog().GetValueOrDefault())
    {
        // The user did not approve a destination file.
        return;
    }

    try
    {
        Mouse.OverrideCursor = Cursors.Wait;

        using (var writer = new StreamWriter(dialog.FileName, false, Encoding.Unicode)) // write stream to the file
        using (var db = new KnessetContext())                                          // db connection
        {
            // Writes a single occurrence; the word itself is written once, as a
            // heading, whenever it differs from the previously written one.
            string currentHeading = null;
            Action<ParagraphWord> writeEntry = entry =>
            {
                if (entry.word != currentHeading)
                {
                    currentHeading = entry.word;
                    writer.WriteLine(entry.word);
                }

                writer.WriteLine("\t{0} [{1}] Paragraph {2} Word {3}", entry.c_name, entry.pr_number, entry.pg_number, entry.word_number);
                writer.WriteLine("\t{0} [{1}] Speaker {2} Paragraph {3} Offset {4}", entry.c_name, entry.pr_number, entry.paragraph.pn_name, entry.paragraph.pn_pg_number, entry.pg_offset);
            };

            // Query that fetches words together with their paragraph info.
            IQueryable<ParagraphWord> query = db.ParagraphWords.Include("paragraph");

            if (groups.Count > 0)
            {
                // Filter by word group[s].
                string[] groupWords = distinctFromGroups(groups);
                query = query.Where(x => groupWords.Contains(x.word));
            }

            IEnumerable<ParagraphWord> ordered;
            if (protocols.Count == 0)
            {
                // Not filtering by protocol: just write all the results.
                ordered = query.OrderBy(x => x.word);
            }
            else
            {
                // Fetch each protocol separately (there is no way to create an OR
                // condition as large as we want) and prefetch all the data, because
                // the sort by word has to happen in memory. Performance is OK; if it
                // were not, a custom SQL query would be used instead.
                var fetched = new List<ParagraphWord>();
                foreach (var protocol in protocols)
                {
                    fetched.AddRange(query.Where(x => x.c_name == protocol.c_name && x.pr_number == protocol.pr_number));
                }

                ordered = fetched.OrderBy(x => x.word);
            }

            foreach (var entry in ordered)
            {
                writeEntry(entry);
            }
        }
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.ToString());
        return;
    }
    finally
    {
        Mouse.OverrideCursor = null;
    }

    // Display a success message to the user.
    MessageBox.Show(dialog.FileName, "האינדקס נשמר", MessageBoxButton.OK, MessageBoxImage.Information);
}