/************************************************************************ * $Id: emsocr.cpp 649 2011-03-10 12:42:36Z elzubeir $ * * ------------ * Description: * ------------ * * (C) Copyright 2009,2010 ALLCONTENT. All rights reserved. * * ----------------- * Revision Details: (Updated by Revision Control System) * ----------------- * $Date: 2011-03-10 16:42:36 +0400 (Thu, 10 Mar 2011) $ * $Author: elzubeir $ * $Revision: 649 $ * $HeadURL: file:///opt/svn/socialhose/trunk/app/emsOCR/emsocr.cpp $ * ************************************************************************/ #include "idrs_config.h" #include "emsocr.h" #include "emsocrdialog.h" #include "ui_emsocrdialog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "coordinates.h" #include "word.h" #include "tag.h" using namespace std; //! output log file ofstream out; //! global stop flag bool stop = false; //! mutes QMutex mutex; //! the setup file structure for hte idrs IDRS_FILE_SETUP_INFO gSampleSetupInfo; //! number of pages to read from db #define MAX_PAGES 5 //bool emsOCR::m_stop = false; emsOCR::emsOCR() { stop = false; m_start = true; m_debug = false; m_sleep = 100; //100 seconds; m_processArabic = false; m_useAPPreprocessing = false; m_useAPDarkBorderRemoval = false; m_useAPBinarization = false; m_useAPLineRemoval = false; m_useAPDespeckle = false; m_useAPDespeckleValue = 10; m_useSkew = false; m_useBinarization = false; m_useDespeckle = false; m_useDespeckleValue = 10; } emsOCR::~emsOCR() { //CIDRS::Unload (); //unload the module } void emsOCR::Stop() { QMutexLocker locker(&mutex); stop = true; m_start = false; } void emsOCR::SetOCRDialog(emsOCRDialog *dlg) { m_ocrDialog = dlg; } bool emsOCR::ReadSettings() { QString config = QApplication::applicationDirPath()+ "/config.ini"; QSettings settings(config, QSettings::IniFormat); m_id = settings.value("ocr/id").toInt(); m_server = settings.value("database/server").toString(); //read database's server value from settings m_databaseName = settings.value("database/database").toString(); //read database's database name value from settings m_user = settings.value("database/uid").toString(); //read database's user id value from settings m_password = settings.value("database/pwd").toString(); //read database's password value from settings m_port = settings.value("database/port").toInt(); //read database's port value from settings m_ocrserver = settings.value("ocr database/server").toString(); //read database's server value from settings m_ocrdatabaseName = settings.value("ocr database/database").toString(); //read database's database name value from settings m_ocruser = settings.value("ocr database/uid").toString(); //read database's user id value from settings m_ocrpassword = settings.value("ocr database/pwd").toString(); //read database's password value from settings m_ocrport = settings.value("ocr database/port").toInt(); //read database's port value from settings m_debug = (settings.value("debug mode/debug").toString().toLower() == "on"); m_sleep = settings.value("sleep time/sleep").toInt(); m_processArabic = (settings.value("process arabic/process").toString().toLower() == "on"); m_useAPPreprocessing = (settings.value("preprocessing mode/preprocessing").toString().toLower() == "on"); m_useAPDarkBorderRemoval = (settings.value("preprocessing mode/darkborder").toString().toLower() == "on"); m_useAPBinarization = (settings.value("preprocessing mode/binarization").toString().toLower() == "on"); m_useAPLineRemoval = (settings.value("preprocessing mode/lineremoval").toString().toLower() == "on"); m_useAPDespeckle = (settings.value("preprocessing mode/despeckle").toString().toLower() == "on"); m_useAPDespeckleValue = settings.value("preprocessing mode/despecklevlaue").toInt(); m_useSkew = (settings.value("processing mode/skew").toString().toLower() == "on"); m_useBinarization = (settings.value("processing mode/binarization").toString().toLower() == "on"); m_useDespeckle = (settings.value("processing mode/despeckle").toString().toLower() == "on"); m_useDespeckleValue = settings.value("processing mode/despecklevlaue").toInt(); return true; } bool emsOCR::Initalize() { m_isReady = true; IDRS_FILE_SETUP_INFO aSampleSetupInfo; //* m_isReady = false; // m_debug = true; if (!ReadSettings() ) { m_isReady = false; //exit(0); out.open("ems-log.txt", ios::app); out << "error reading settings file"; out.close(); return false; } if(m_debug) { const int MAX_LOG_FILE = 1024*1024*10; //QMessageBox::information(NULL, "debug", "debug enabled"); QFile file("ems-log.txt"); if(file.size() >= MAX_LOG_FILE) out.open("ems-log.txt", ios::out); else out.open("ems-log.txt", ios::app); if(!out.is_open()) { QMessageBox::warning(NULL, "error opening", "error opening ems-log.txt file"); } else out << "This is a sample test of the file " << endl; } try { if ( SetupIDRS ( aSampleSetupInfo ) != 0 ) { return false; //exit(0); //return false; } else { m_isReady = true; } } catch ( IDRSException & aIDRSException ) { CIDRS::Unload (); ShowException ( aIDRSException ); return false; //exit(0);//return false; } //check if the advance pre-processor loaded if ( IDRS::CIDRS::IsExtensionReady ( IDRS_MODULE_PREPRO, IDRS_PREPRO_EXTENSION_ADVANCED_PREPRO )) { out << "Advance Preprocessing module is loaded correctly" << endl; m_isAdvancedPreprocessingLoaded = true; } else { out << "Advance Preprocessing module is NOT loaded " << endl; m_isAdvancedPreprocessingLoaded = false; } // Create the iDRS objects try { m_reader = CReader::Create (); } catch ( IDRSException & aIDRSException ) { m_reader = NULL; m_isReady = false; ShowException ( aIDRSException ); CIDRS::Unload (); return false; //exit(0); //return false; } //*/ if (!ReadSettings() ) { m_isReady = false; //exit(0); return false; } if (!ConnectToDatabase() ) { //erro handling show be done here if(m_debug) out << CurrentDateTime().toStdString() << "\tCould not connect to DB" << endl; m_isReady = false; //exit(0); return false; } GetIssuePath(); return m_isReady; } void emsOCR::run() { stop = false; m_start = true; m_pagesToProcess.clear(); Process(); } void emsOCR::Process() { while(m_start) { GetPagesToProcess(); if(m_pagesToProcess.size() == 0) { //no pages to process, sleep for 2 seconds //QtServiceBase::instance()->logMessage("No pages to process, sleeping for 2 seconds"); if(m_debug) out << CurrentDateTime().toStdString() << "\tNo pages to process, sleeping for "<< m_sleep <<" seconds\r"; UpdateOCRStatus(0, 0, 0); Sleep(m_sleep*1000); continue; } for(int i=0; i< m_pagesToProcess.size(); i++) { { QMutexLocker locker(&mutex); if( stop ) break; } if(m_debug) out << CurrentDateTime().toStdString() << " # of pages to process: " << m_pagesToProcess.size() << ", now processing page # " << i+1 << endl; /*if( !m_pagesToProcess[i].m_isCopied ) { if(m_debug) out << CurrentDateTime().toStdString() << "\tPage " << m_pagesToProcess[i].m_localPath.toStdString() << " could not be copied, skipping to next page" << endl; continue; //if its not copied, then we just move to the next copied page }*/ m_currentIndex = i; QString pagePath = m_pagesToProcess[i].m_remotePath; //QtServiceBase::instance()->logMessage( QString("start process of page %1").arg(pagePath) ); if(m_debug) out << CurrentDateTime().toStdString() << "\tStart process of page " << pagePath.toStdString() << endl; SetPagesInView(); if(m_pagesToProcess[i].m_isArabic && !m_processArabic) //skip arabic pages. { if(m_debug) out << CurrentDateTime().toStdString() << "\tProcess of page " << pagePath.toStdString() << " SKIPPED, Arabic processing disabled" << endl; continue; } QTime t; t.start(); UpdateOCRStatus(m_pagesToProcess[i].m_publicationId, m_pagesToProcess[i].m_publicationIssue, m_pagesToProcess[i].m_pageName); bool ret = ProcessPage(pagePath, m_pagesToProcess[i].m_isArabic); double elapsed = t.elapsed()/1000.0; if(ret) { //we have a successfull processing, we do the insertion of the database, and updating of the status QList tags = SearchTags( m_pagesToProcess[i].m_isArabic ); AddToDatabase(m_pagesToProcess[i], tags); //clear text m_pageText = ""; m_words.clear(); //QtServiceBase::instance()->logMessage( QString("Process of page %1 completed successfully").arg(pagePath) ); if(m_debug) out << CurrentDateTime().toStdString() << "\tProcess of page " << pagePath.toStdString() << " completed successfully" << endl; m_pagesToProcess[i].m_isProcessed = true; UpdateStatus(m_pagesToProcess[i].m_sectionPagesId); AddPageToProcessedTable(ret, CurrentDateTime(), m_pagesToProcess[i].m_customName, elapsed, tags.count() ); IncrementTotalProcssedPages(); } else { //we need to re-process it another time //QtServiceBase::instance()->logMessage( QString("Process of page %1 unsuccessfull, re-process one more time").arg(pagePath) ); if(m_debug) out << CurrentDateTime().toStdString() << "\tProcess of page " << pagePath.toStdString() << " unsuccessfull, re-process one more time" << endl; bool ret = ProcessPage(pagePath, m_pagesToProcess[i].m_isArabic); if(ret) { //we have a successfull processing, we do the insertion of the database, and updating of the status QList tags = SearchTags( m_pagesToProcess[i].m_isArabic ); AddToDatabase(m_pagesToProcess[i], tags); //clear text m_pageText = ""; m_words.clear(); //QtServiceBase::instance()->logMessage( QString("Process of page %1 completed successfully").arg(pagePath) ); if(m_debug) out << CurrentDateTime().toStdString() << "\tProcess of page " << pagePath.toStdString() << " completed successfully" << endl; m_pagesToProcess[i].m_isProcessed = true; UpdateStatus(m_pagesToProcess[i].m_sectionPagesId); AddPageToProcessedTable(ret, CurrentDateTime(), m_pagesToProcess[i].m_customName, elapsed, tags.count() ); IncrementTotalProcssedPages(); } else { //we need to re-process it another time //QtServiceBase::instance()->logMessage( QString("Process of page %1 unsuccessfull, re-process one more time").arg(pagePath) ); if(m_debug) out << CurrentDateTime().toStdString() << "\tProcess of page " << pagePath.toStdString() << " unsuccessfull, SKIPPING this page" << endl; } } } //now delete the copied pages //DeleteFiles(); //and reset the pages that have not been processed to be processed. //ResetUnprocessedPages(); } ClearPagesInView(); if(m_pagesToProcess.size() > 0) ResetUnprocessedPages(); emit terminated(); } bool emsOCR::ConnectToDatabase() { m_database = QSqlDatabase::addDatabase("QMYSQL"); // initalize the connection m_database.setHostName(m_server); m_database.setPort(m_port); m_database.setDatabaseName(m_databaseName); m_database.setUserName(m_user); m_database.setPassword(m_password); // try to open the connection, if ok returns true, else returns false bool ok = m_database.open(); if(!ok) { // if error, get the error from the database text and print it to debug screen QString str = m_database.lastError().databaseText(); //QMessageBox::critical(this, tr("Database Error"), tr("Error connecting to master database\n") + str); //cout << str.toStdString(); return false; } m_ocrdatabase = QSqlDatabase::addDatabase("QMYSQL", "OCRDB"); // initalize the connection m_ocrdatabase.setHostName(m_ocrserver); m_ocrdatabase.setPort(m_ocrport); m_ocrdatabase.setDatabaseName(m_ocrdatabaseName); m_ocrdatabase.setUserName(m_ocruser); m_ocrdatabase.setPassword(m_ocrpassword); // try to open the connection, if ok returns true, else returns false ok = m_ocrdatabase.open(); if(!ok) { // if error, get the error from the database text and print it to debug screen QString str = m_database.lastError().databaseText(); //cout << str.toStdString(); return false; } return true; } void emsOCR::GetIssuePath() { QString issueQuery = "SELECT issues_path, issues_windows_drive_letter FROM system_configuration"; QSqlQuery query(m_database); query.exec(issueQuery); QString str = query.lastError().databaseText(); while( query.next() ) { QString path = query.value(0).toString(); QString drive = query.value(1).toString(); m_issuePath = drive + path; } } void emsOCR::UpdateStatus(int sectionPagesId) { if(m_debug) out << CurrentDateTime().toStdString() << "\t---- UpdateStatus()" << endl; QString lockQuery = "LOCK TABLE section_pages WRITE"; QString unlockQuery = "UNLOCK TABLES"; QString updateQuery = "UPDATE section_pages SET status = 3 WHERE section_pages = :section_pages"; QSqlQuery query(m_database); //if(m_debug) out << CurrentDateTime().toStdString() << "\t lock query: \"" << lockQuery.toStdString() << "\"" << endl; //if(m_debug) out << CurrentDateTime().toStdString() << "\t call lock query "; bool ret = query.exec( lockQuery ); query.prepare( updateQuery ); query.bindValue(":section_pages", sectionPagesId); ret = query.exec(); if(!ret) { if(m_debug) out << CurrentDateTime().toStdString() << "\tCall to update query returned false" << endl; if(m_debug) out << CurrentDateTime().toStdString() << "\t\tReason: " << query.lastError().databaseText().toStdString() << endl; } //now unlock the pages //if(m_debug) out << CurrentDateTime().toStdString() << "\tUnlock query: \"" << unlockQuery.toStdString() << "\"" << endl; //if(m_debug) out << CurrentDateTime().toStdString() << "\tCall unlock query "; ret = query.exec( unlockQuery ); if(!ret) { if(m_debug) out << " returned false" << endl; if(m_debug) out << CurrentDateTime().toStdString() << "\t\tReason: " << query.lastError().databaseText().toStdString() << endl; } if(m_debug) out << CurrentDateTime().toStdString() << " ---- End of UpdateStatus() ---- " << endl; } void emsOCR::ResetUnprocessedPages() { if(m_debug) out << CurrentDateTime().toStdString() << " ---- ResetUnprocessedPages() ---- " << endl; QString lockQuery = "LOCK TABLE section_pages WRITE"; QString unlockQuery = "UNLOCK TABLES"; QString updateQuery = "UPDATE section_pages SET status = 1 WHERE section_pages = :section_pages"; QSqlQuery query(m_database); //first lock the page. query.exec( lockQuery ); //now change the status of the pages to being processed for(int i=0; i< m_pagesToProcess.size(); i++) { if(!m_pagesToProcess[i].m_isProcessed) { query.prepare( updateQuery ); query.bindValue(":section_pages", m_pagesToProcess[i].m_sectionPagesId); bool ret = query.exec(); } } //now unlock the pages query.exec( unlockQuery ); if(m_debug) out << CurrentDateTime().toStdString() << " ---- End of ResetUnprocessedPages() ---- " << endl; } void emsOCR::GetPagesToProcess() { if(m_debug) out << CurrentDateTime().toStdString() << "\t---- GetPagesToProcess()" << endl; //first of all, try to lock the dummy table //then read the pages that needs to be processed, and get the first n pages (n is determined in the config files) //change their status to being processed //unlock the dummy table QString lockQuery = "LOCK TABLE section_pages WRITE"; QString lockQueryNoArabic = "LOCK TABLE section_pages WRITE, publication READ, publication_issue READ, issue_sections READ"; QString unlockQuery = "UNLOCK TABLES"; QString selectQuery = "SELECT section_pages, page_name FROM section_pages WHERE status = 1 LIMIT 5"; QString selectQueryNoArabic = " SELECT section_pages, page_name " " FROM " " publication, publication_issue, issue_sections, section_pages " " WHERE " " publication.id_publication = publication_issue.id_publication AND " " publication_issue.id_publication_issue = issue_sections.id_publication_issue AND " " issue_sections.id_issue_sections = section_pages.id_issue_sections AND " " language not like '%Arabic%' and status = 1 LIMIT 5"; QString updateQuery = "UPDATE section_pages SET status = 2 WHERE section_pages = :section_pages"; m_pagesToProcess.clear(); //QSqlQuery query = m_database.query(); QSqlQuery query(m_database); //first lock the page. if(m_debug) out << CurrentDateTime().toStdString() << "\t--->lock query: \"" << lockQuery.toStdString() << "\"" << endl; if(m_debug) out << CurrentDateTime().toStdString() << "\t\tcall lock query "; bool ret = false; if(m_processArabic) ret = query.exec( lockQuery ); else ret = query.exec( lockQueryNoArabic ); if(!ret) { if(m_debug) out << " returned false" << endl; if(m_debug) out << CurrentDateTime().toStdString() << "\t\treason: " << query.lastError().databaseText().toStdString() << endl; return; } else if(m_debug) out << " returned true" << endl; if(m_debug) out << CurrentDateTime().toStdString() << "\tSelect query: \"" << selectQuery.toStdString() << "\"" << endl; if(m_debug) out << CurrentDateTime().toStdString() << "\tCall select query " ; if(!m_processArabic) ret = query.exec(selectQueryNoArabic); else ret = query.exec( selectQuery ); if(!ret) { if(m_debug) out << " returned false" << endl; if(m_debug) out << CurrentDateTime().toStdString() << "\t\treason: " << query.lastError().databaseText().toStdString() << endl; return; } else if(m_debug) out << " returned true" << endl; if(m_debug) out << CurrentDateTime().toStdString() << "\tNumber of rows returned " << query.size() << endl; if( query.size() > 0 ) { while ( query.next() ) { //if( i >= MAX_PAGES) //break; QString sectionPages = query.value(0).toString(); QString pageName = query.value(1).toString(); if(m_debug) out << "\t\t\tAdd " << pageName.toStdString() << " to list of pages to be processed" << endl; PageInfo pi; pi.m_sectionPagesId = sectionPages.toInt(); pi.m_remotePath = pageName; pi.m_localPath = pageName; pi.m_pageName = pageName.remove(".jpg").toInt(); m_pagesToProcess.append(pi); } //now change the status of the pages to being processed for(int i=0; i< m_pagesToProcess.size(); i++) { query.prepare( updateQuery ); query.bindValue(":section_pages", m_pagesToProcess[i].m_sectionPagesId); bool ret = query.exec(); if(!ret) { if(m_debug) out << CurrentDateTime().toStdString() << "\tCall to update query returned false" << endl; if(m_debug) out << CurrentDateTime().toStdString() << "\t\tReason: " << query.lastError().databaseText().toStdString() << endl; //return; } } } //now unlock the pages if(m_debug) out << CurrentDateTime().toStdString() << "\tUnlock query: \"" << unlockQuery.toStdString() << "\"" << endl; if(m_debug) out << CurrentDateTime().toStdString() << "\tCall unlock query "; ret = query.exec( unlockQuery ); if(!ret) { if(m_debug) out << " returned false" << endl; if(m_debug) out << CurrentDateTime().toStdString() << "\t\tReason: " << query.lastError().databaseText().toStdString() << endl; return; } else if(m_debug) out << " returned true" << endl; if( m_pagesToProcess.size() > 0 ) ConstructPagesPath(); if(m_debug) out << CurrentDateTime().toStdString() << " ---- End of GetPagesToProcess() ---- " << endl; } QString emsOCR::SectionName(int id) { QSqlQuery query(m_database); query.prepare("SELECT name_article_section_en FROM article_section WHERE id_article_section= :sectionId"); query.bindValue(":sectionId", id); if(!query.exec()) { //QMessageBox::critical(NULL, QObject::tr("SQL Error"), query.lastError().databaseText()); return ""; } QString name = ""; if(query.next()) name = query.value(0).toString(); return name; } void emsOCR::ConstructPagesPath() { if(m_debug) out << CurrentDateTime().toStdString() << " ---- ConstructPagesPath() ---- " << endl; QSqlQuery query (m_database); for(int i = 0; i < m_pagesToProcess.size(); i++) { ostringstream oss; oss << "SELECT issue_date, pi.id_publication, id_section, `language`, name_article_section_en, pi.name_publication_en, " << " pi.country, pi.id_publication_type, pis.id_publication_issue " << " FROM " << " publication as pi, publication_issue as pis, issue_sections as ise, section_pages as sp, article_section " << " WHERE " << " pi.id_publication = pis.id_publication AND " << " pis.id_publication_issue = ise.id_publication_issue AND " << " ise.id_issue_sections = sp.id_issue_sections AND " << " id_article_section = id_section AND " << " section_pages = " << m_pagesToProcess[i].m_sectionPagesId; QString str = QString::fromStdString( oss.str() ); query.exec( str ); while( query.next() ) { QString issueDate = query.value(0).toString(); QString publicationId = query.value(1).toString(); QString sectionID = query.value(2).toString(); QString language = query.value(3).toString(); QString sectionName = query.value(4).toString(); QString publicationName = query.value(5).toString(); int country = query.value(6).toInt(); int publicationType = query.value(7).toInt(); int publicationIssue = query.value(8).toInt(); //path is in the format of: Fileserver_path_Issues/YYYY/MM/DD/ID_PUBLICATION/ID_SECTION/ QStringList dateVecor = issueDate.split("-"); QString remotepath = QString("%1%2\\%3\\%4\\%5\\%6\\300dpi\\%7") .arg(m_issuePath) .arg(dateVecor[0]) .arg(dateVecor[1]) .arg(dateVecor[2]) .arg(publicationId) .arg(sectionID) .arg(m_pagesToProcess[i].m_remotePath); remotepath.replace("/", "\\"); QString customPagePath= QString("%1/%2/%3/%4") .arg(publicationName) .arg(issueDate) .arg(sectionName) .arg(m_pagesToProcess[i].m_remotePath); //m_issuePath + dateVecor[0] + "/" + dateVecor[1] + "/" + dateVecor[2] + "/" + publicationID + "/" + sectionID + "/300dpi/" + m_pagesToProcess[i].m_remotePath; QString localpath = QString("C:\\%1_%2_%3").arg(publicationId).arg(sectionID).arg(m_pagesToProcess[i].m_localPath); m_pagesToProcess[i].m_remotePath = remotepath; m_pagesToProcess[i].m_localPath = localpath; bool isArabic = ( language.compare("Arabic", Qt::CaseInsensitive) == 0 ); m_pagesToProcess[i].m_isArabic = isArabic; m_pagesToProcess[i].m_isCopied = false; m_pagesToProcess[i].m_isProcessed = false; m_pagesToProcess[i].m_customName = customPagePath; m_pagesToProcess[i].m_publicationId = publicationId.toInt(); m_pagesToProcess[i].m_publicationType = publicationType; m_pagesToProcess[i].m_publicationDate = issueDate; m_pagesToProcess[i].m_country = country; m_pagesToProcess[i].m_publicationIssue = publicationIssue; } } //copy Files from remoate path to local path //CopyFiles(); if(m_debug) out << CurrentDateTime().toStdString() << " ---- End of ConstructPagesPath() ---- " << endl; } bool emsOCR::ProcessPage(QString &pagePath, bool isArabic) { if(m_debug) out << CurrentDateTime().toStdString() << " ---- ProcessPage() ---- " << endl; IDRS::CPage aPage; try { if(isArabic) { COCRContext aOCRContext = COCRContext::Create (IDRS_LNG_ARABIC); m_reader.SetOCRParameters(aOCRContext); } else { COCRContext aOCRContext = COCRContext::Create (IDRS_LNG_ENGLISH); m_reader.SetOCRParameters(aOCRContext); } // Load the page //aPage = IDRS::CPage::Create (); std::wstring page = pagePath.toStdWString(); IDRS::CImage aImage; aImage = IDRS::CImage::Create(); aImage.Load( (IDRS::IDRS_CTSTR) page.c_str() ); if(isArabic) { if(!m_processArabic) return true; //we will do the pre-processing in here if(m_debug) out << CurrentDateTime().toStdString() << " ---- Starting Advanced Pre-processing for Arabic ---- " << endl; if(m_useAPPreprocessing) { if(m_isAdvancedPreprocessingLoaded) { if(m_useAPDarkBorderRemoval) { IDRS::ADVANCED_PREPRO::CDarkBordersRemoval aDarkBordersRemoval; aDarkBordersRemoval = IDRS::ADVANCED_PREPRO::CDarkBordersRemoval::Create (); if(m_debug) out << CurrentDateTime().toStdString() << " Dark Border Removal the image " << endl; aDarkBordersRemoval.RemoveDarkBorders ( aImage, aImage ); } if(m_useAPBinarization && (aImage.GetImageType() != IDRS_IMG_BW)) { IDRS::ADVANCED_PREPRO::CBinarization aBinarization; aBinarization = IDRS::ADVANCED_PREPRO::CBinarization::Create (); if(m_debug) out << CurrentDateTime().toStdString() << " BinarizeGradl the image " << endl; aBinarization.BinarizeGradl ( aImage, aImage ); } if(m_useAPLineRemoval) { IDRS::ADVANCED_PREPRO::CLineRemoval aLineRemoval; aLineRemoval = IDRS::ADVANCED_PREPRO::CLineRemoval::Create (); if(m_debug) out << CurrentDateTime().toStdString() << " Line Removal the image " << endl; aLineRemoval.RemoveLines ( aImage ); } if(m_useAPDespeckle && (aImage.GetImageType() == IDRS_IMG_BW) ) { IDRS::ADVANCED_PREPRO::CDespeckle aDespeckle; aDespeckle = IDRS::ADVANCED_PREPRO::CDespeckle::Create (); if(m_debug) out << CurrentDateTime().toStdString() << " Despeckle the image " << endl; aDespeckle.Despeckle ( aImage, m_useAPDespeckleValue ); } } else QMessageBox::information(NULL, tr("Advanced Preprocessing"), tr("Advanced Preprocessing module is not loaded!")); } IDRS::CImgProcessor aImageProcessor; aImageProcessor = IDRS::CImgProcessor::Create (); if(m_useSkew) { if(m_debug) out << CurrentDateTime().toStdString() << " Skewing the image, " ; double dAngle = aImageProcessor.Deskew( aImage ); out << "Skew angle is " << dAngle << endl; } if(m_debug) out << CurrentDateTime().toStdString() << " ---- End Advanced Pre-processing for Arabic ---- " << endl; } else { IDRS::CImgProcessor aImageProcessor; aImageProcessor = IDRS::CImgProcessor::Create (); if(m_useSkew) { if(m_debug) out << CurrentDateTime().toStdString() << " Skewing the image, " ; double dAngle = aImageProcessor.Deskew( aImage ); out << "Skew angle is " << dAngle << endl; } // Binarises if ( m_useBinarization && (aImage.GetImageType() != IDRS_IMG_BW) ) { if(m_debug) out << CurrentDateTime().toStdString() << " BinarizeGradl the image " << endl; IDRS::CImgProcessor::BinarizeGradl ( aImage, aImage, IDRS_TRUE ); } // Despeckle if(m_useDespeckle && (aImage.GetImageType() == IDRS_IMG_BW)) { if(m_debug) out << CurrentDateTime().toStdString() << " Despeckle the image " << endl; IDRS::CImgProcessor::Despeckle ( aImage, m_useDespeckleValue ); } } if(m_debug) out << CurrentDateTime().toStdString() << " Create CPage()" << endl; aPage = IDRS::CPage::Create ( aImage ); // Do the recognition if(m_debug) out << CurrentDateTime().toStdString() << " Set Page to Reader" << endl; m_reader.Read ( aPage ); } catch ( IDRS::IDRSException & aIDRSException ) { ShowException ( aIDRSException ); aPage = NULL; if(m_debug) out << CurrentDateTime().toStdString() << "\t Error Loading page: " << pagePath.toStdString() << endl; //IDRS::CIDRS::Unload (); return false; } try { IDRS::CZone aZone; IDRS::CZoneMeaning aZoneMeaning; IDRS::IDRS_RECT aRect; IDRS::CMeaningElt aMeaningElt; IDRS::CMeaningEltFmtNfo aMeaningEltFmtNfo; IDRS::CMeaningEltTxt aMeaningEltTxt; IDRS::IDRS_UINT ui, uiZoneIndex, uiMeaningEltIndex; IDRS::IDRS_BOOL bFirstLine, bFirstWordInLine; bFirstLine = bFirstWordInLine = IDRS_TRUE; IDRS::IDRS_RECT rect; wstring pageText; if(m_debug) out << CurrentDateTime().toStdString() << " Start Text extraction" << endl; for ( uiZoneIndex = 0; uiZoneIndex < aPage.GetZoneCount (); uiZoneIndex ++ ) { aZone = aPage.GetZoneAt ( uiZoneIndex ); // Writes zone header aRect = aZone.GetBoundingRect (); aZoneMeaning = aZone.Meaning (); wstring str; int itop, ileft, ibottom, iright; if ( ! aZoneMeaning.IsNull ()) { for ( uiMeaningEltIndex = 0; uiMeaningEltIndex < aZoneMeaning.GetCount (); uiMeaningEltIndex ++ ) { aMeaningElt = aZoneMeaning.GetAt ( uiMeaningEltIndex ); switch ( aMeaningElt.GetMEType ()) { case IDRS_ME_TEXT: aMeaningEltTxt = aMeaningElt; for ( ui = 0; ui < aMeaningEltTxt.GetLength (); ui ++ ) { str += (wchar_t) aMeaningEltTxt.GetBuffer()[ui]; pageText += (wchar_t) aMeaningEltTxt.GetBuffer()[ui]; } break; case IDRS_ME_FMT_NFO: aMeaningEltFmtNfo = aMeaningElt; switch ( aMeaningEltFmtNfo.GetFmtNfoType ()) { case IDRS_FMT_NFO_WORD_START: aMeaningEltFmtNfo.GetFmtNfoValue(rect); if ( ! bFirstWordInLine ) { Word w(str, itop/3, ileft/3, ibottom/3, iright/3); m_words.append(w); str = L""; pageText += L" "; } else bFirstWordInLine = IDRS_FALSE; itop = rect.uiTop; ileft = rect.uiLeft; ibottom = rect.uiBottom; iright = rect.uiRight; break; case IDRS_FMT_NFO_LINE_START: if ( ! bFirstLine ) { pageText += L" "; if(str.length() > 0) { Word w(str, itop/3, ileft/3, ibottom/3, iright/3); m_words.append(w); str = L""; } aMeaningEltFmtNfo.GetFmtNfoValue(rect); itop = rect.uiTop; ileft = rect.uiLeft; ibottom = rect.uiBottom; iright = rect.uiRight; } else bFirstLine = IDRS_FALSE; bFirstWordInLine = IDRS_TRUE; break; case IDRS_FMT_NFO_PARA_START : pageText += L"\n"; default: break; } break; default: break; } } //add the last word of the line if(str.length() > 0) { Word w(str, itop/3, ileft/3, ibottom/3, iright/3); m_words.append(w); str = L""; } } } if(m_debug) out << CurrentDateTime().toStdString() << " End extraction" << endl; m_pageText = QString::fromStdWString( pageText ); if(m_debug) out << CurrentDateTime().toStdString() << " RemoveDashAndTheFollowingSpace()" << endl; RemoveDashAndTheFollowingSpace(m_pageText); //now remove dash from the word_list as well, and appened the next word to it. for(int i=0; i< m_words.count()-1; i++) { if( m_words[i].word().endsWith("-") ) { m_words[i].appendToWord(m_words[i+1].word(), true); m_words[i].AddCoordinates( m_words[i+1].coordinates() ); //now remove the i+1 entry m_words.removeAt(i+1); i--; } } } catch ( IDRS::IDRSException & aIDRSException ) { ShowException ( aIDRSException ); // aReader = NULL; // IDRS::CIDRS::Unload (); if(m_debug) out << CurrentDateTime().toStdString() << "\t Error Loading page: " << pagePath.toStdString() << endl; return false; } catch ( ... ) { //aReader = NULL; //IDRS::CIDRS::Unload (); //cout << "An error occured in the iDRS." << endl; return false; } if(m_debug) out << CurrentDateTime().toStdString() << " ---- End of ProcessPage() ---- " << endl; return true; } void emsOCR::GetTagsFromDatabase(bool isArabic) { //check if we need to get a newer version of the database if(m_debug) out << CurrentDateTime().toStdString() << " ---- GetTagsFromDatabase() ---- " << endl; QSqlQuery query( m_database ); bool ret; //if(isArabic) //ret = query.exec( "SELECT id_company, name_company as name_company FROM company WHERE date_end > NOW() AND name_company_ar not like ''" ); //else ret = query.exec( "SELECT id_company, name_company FROM company WHERE date_end > NOW()" ); //if(ret) m_databaseTags.clear(); while( query.next() ) { int id = query.value(0).toInt(); QString tag = query.value(1).toString(); //QString::fromUtf8( query.value(1).toString().toAscii().data() ); TagInfo ti; ti.m_id = id; ti.m_name = tag; m_databaseTags.append(ti); } //if(isArabic) // ret = query.exec("SELECT cm.id_company, name_searchword_ar FROM company AS cm, searchword AS sw WHERE sw.id_company = cm.id_company AND cm.date_end > NOW() AND name_searchword_ar not like ''"); //else ret = query.exec("SELECT cm.id_company, name_searchword FROM company AS cm, searchword AS sw WHERE sw.id_company = cm.id_company AND cm.date_end > NOW()"); while( query.next() ) { int id = query.value(0).toInt(); QString tag = query.value(1).toString(); //QString::fromUtf8( query.value(1).toString().toAscii().data() ); TagInfo ti; ti.m_id = id; ti.m_name = tag; m_databaseTags.append(ti); } //m_databaseTags.sort(); //qSort(m_databaseTags.begin(), m_databaseTags.end()); if(m_debug) out << CurrentDateTime().toStdString() << " ---- End of GetTagsFromDatabase() ---- " << endl; } QList emsOCR::SearchTags(bool isArabic) { if(m_debug) out << CurrentDateTime().toStdString() << " ---- SearchTags() ---- " << endl; QList tags; if(m_debug) out << "before GetTagsFromDatabase()" << endl; GetTagsFromDatabase(isArabic); if(m_debug) out << "after GetTagsFromDatabase(), number of tags is :" << m_databaseTags.count() << endl; clock_t start = clock(); for (int i = 0; i < m_databaseTags.count(); i++) { int id = m_databaseTags[i].m_id; QString tag = m_databaseTags[i].m_name; //if(m_debug) out << "searching for Tag (" << id << "): \n" << tag.toStdString(); QRegExp regEx(tag, Qt::CaseInsensitive); int count = 0; int pos = 0; while( (pos = regEx.indexIn(m_pageText, pos) ) != -1) { //if(m_pageText.indexOf(QRegExp(tag), 0 ) != -1){ QString foundTag = m_pageText.mid(pos, regEx.matchedLength() ); pos += regEx.matchedLength(); //check if this is a multi-word tag, if so, tell the function bool ismultiword = false; if(foundTag.indexOf(" ") != -1) ismultiword = true; QStringList coorVector = GetTagCoordinates(foundTag, ismultiword); if(coorVector.count() > 0) { ////cout << "\t'" << tag << "' is a TAG with coordinate(s): " << endl; for(int count=0; count < coorVector.count(); count++) { QString str = coorVector[count]; //since its a multiword, we need to convert the multiple boxes surronding each word to a single box if(ismultiword) ConvertToSingleBox(str, isArabic); ////cout << "\t\t---" << str << endl << endl; Tag t(id, str, foundTag); Tag::AddTag(tags, t); } } } } //double diff = ( clock() - start ) / (double)CLOCKS_PER_SEC; //cout << "1- Time took to search for keywords is " << diff << endl; if(m_debug) out << CurrentDateTime().toStdString() << " ---- End of SearchTags() ---- " << endl; return tags; } void emsOCR::DeleteFromDatabase(int id_section_pages) { if(m_debug) out << CurrentDateTime().toStdString() << " ---- DeleteFromDatabase() ---- " << endl; QSqlQuery query(m_database); //DELETE page text query.prepare("DELETE FROM page_text WHERE id_section_pages = :id_section_pages"); query.bindValue(":id_section_pages", id_section_pages); bool ret = query.exec( ); query.prepare("DELETE FROM page_tag_coordinates WHERE id_section_pages = :id_section_pages"); query.bindValue(":id_section_pages", id_section_pages); ret = query.exec( ); QSqlQuery query2(m_ocrdatabase); query2.prepare("DELETE FROM page_word_coordinates WHERE id_section_pages = :id_section_pages"); query2.bindValue(":id_section_pages", id_section_pages); ret = query2.exec( ); if(m_debug) out << CurrentDateTime().toStdString() << " ---- End of DeleteFromDatabase() ---- " << endl; } void emsOCR::AddToDatabase(PageInfo pi, QList &tags) { if(m_debug) out << CurrentDateTime().toStdString() << " ---- AddToDatabase() ---- " << endl; //first thing, we delete the previous entries from database DeleteFromDatabase(pi.m_sectionPagesId); QSqlQuery query( m_database ); //insert page text { //if(m_debug) out << "text is " << m_pageText.toUtf8().data() << endl << endl << endl; //QString qtext = "INSERT INTO page_text (`id_section_pages`, `text`) VALUES ("; //qtext += QString("%1").arg(id_section_pages); //qtext += ", " + m_pageText + ")"; m_pageText.remove("\""); m_pageText.remove("'"); m_pageText.remove("`"); query.prepare("INSERT INTO page_text (`id_section_pages`, `text`, id_publication, id_publication_type, id_country, publication_date) VALUES " "(:id_section_pages, :pagetext, :id_publication, :id_publication_type, :id_country, :publication_date)"); query.bindValue(":id_section_pages", pi.m_sectionPagesId); query.bindValue(":pagetext", m_pageText ); query.bindValue(":id_publication", pi.m_publicationId); query.bindValue(":id_publication_type", pi.m_publicationType); query.bindValue(":id_country", pi.m_country); query.bindValue(":publication_date", pi.m_publicationDate); if ( !query.exec() ) { if(m_debug) out << "AddToDatabase() Error happend, because of: " << query.lastError().text().toStdString(); if(m_debug) out << m_pageText.toStdString(); } } QSqlQuery query2( m_ocrdatabase ); if(m_debug) out << " trying to insert " << tags.count() << " tags to page_tag_coordinates table" << endl; for(int i=0; i < tags.count(); i++) { Tag t = tags[i]; //ostringstream os; query2.prepare("INSERT INTO page_tag_coordinates (`id_section_pages`, `id_company`, `coordinates`, `tag`) VALUES (:id_section_pages, :id, :coord, :tag)"); query2.bindValue(":id_section_pages" , pi.m_sectionPagesId); query2.bindValue(":id" , t.id() ); query2.bindValue(":coord" , t.coordinates() ); query2.bindValue(":tag" , t.tag() ); bool ret = query2.exec( ); if(!ret) { if(m_debug) { out << "Error inserting to page_tag_coordinates: " << query2.lastError().text().toStdString() << endl; out << "id: " << t.id() << ", name: " << t.tag().toStdString() << " coordinates: " << t.coordinates().toStdString() << endl; out << "page_tag_coordinates query: " << query2.lastQuery().toStdString() << endl; } } } //now insert into the page_word_coordinates for(int i=0; i < m_words.count(); i++) { //QString word = m_words[i].word(); //if(m_debug) out << "word is: " << m_words[i].word() << endl; QString coord = m_words[i].FormatCoordinatesToExport(); query2.prepare("INSERT INTO page_word_coordinates (`id_section_pages`, `word`, `coordinates`) VALUES (:id_section_pages, :word, :coord)"); query2.bindValue(":id_section_pages", pi.m_sectionPagesId); query2.bindValue(":word", m_words[i].wordOriginal() ); query2.bindValue(":coord", coord); bool ret = query2.exec(); if(!ret) { if(m_debug) { out << "Error inserting to page_word_coordinates: " << query2.lastError().text().toStdString() << endl; out << "page_word_coordinates query: " << query2.lastQuery().toStdString() << endl; } } } if(m_debug) out << CurrentDateTime().toStdString() << " ---- End of AddToDatabase() ---- " << endl; } QStringList emsOCR::GetTagCoordinates(QString tag, bool ismultiword) { //first phase is to look for a single word //clock_t start = clock(); QStringList locations; tag.replace("'s", ""); //remove the 's from the end of the keyword if(ismultiword) { QStringList tagParts = tag.split(" "); //then we search for them in sequence for(int i=0; i< m_words.size() - tagParts.count(); i++) { bool found = true; for(int j=0; j < tagParts.count(); j++) //dont check for the last word { QString word = m_words[i+j].word(); //removePunctuation(word); if( tagParts[j].compare(word, Qt::CaseInsensitive) == 0 ) continue; else { found = false; break; } } if(found) { //we get the location of the first and the last tags //Coordinates c(m_words[i].top(), m_words[i].left(), m_words[i+tagParts.size()-1].bottom(), m_words[i+tagParts.size()-1].right()); QString str = ""; for(int j=0; j< tagParts.count(); j++) { QString tmp = m_words[i+j].FormatCoordinatesToExport(); str += tmp; if( j != tagParts.size() -1) str+= ":"; } locations.append( str ); i += tagParts.count(); //hup over the m_words found } } } else { for(int i=0; i < m_words.size(); i++) { QString word = m_words[i].word(); if ( tag.compare(word, Qt::CaseInsensitive) == 0) { QString str = m_words[i].FormatCoordinatesToExport(); locations.append(str); } } } //double diff = ( clock() - start ) / (double)CLOCKS_PER_SEC; ////cout << "Time took to search for GetTagCoordinates is " << diff << endl; return locations; } void emsOCR::ConvertToSingleBox(QString &str, bool isArabic) { QStringList list = str.split(":"); vector crd; if(!isArabic) { for(int i=0; i< list.count(); i++) { QStringList crdStr = list[i].split(","); Coordinates c(crdStr[0].toInt(), crdStr[1].toInt(), crdStr[2].toInt() ,crdStr[3].toInt()); crdStr.clear(); crd.push_back(c); } } else { for(int i=list.count()-1 ; i> -1; i--) { QStringList crdStr = list[i].split(","); Coordinates c(crdStr[0].toInt(), crdStr[1].toInt(), crdStr[2].toInt() ,crdStr[3].toInt()); crdStr.clear(); crd.push_back(c); } } //now we have all the numbers in coordiantes, we do our work QString newstr = ""; ostringstream oss; for(size_t i=0; i< crd.size(); i++) { QString topleft; QString bottomright; oss.str(""); oss << crd[i].top() << "," << crd[i].left() << ","; topleft = QString::fromStdString( oss.str() ); oss.str(""); oss << crd[i].bottom() << "," << crd[i].right(); bottomright = QString::fromStdString( oss.str() ); for(size_t j=i+1; j< crd.size(); j++) { if( abs (crd[i].top() - crd[j].top()) < 6) // on the same line { oss.str(""); oss << crd[j].bottom() << "," << crd[j].right(); bottomright = QString::fromStdString( oss.str() ); crd.erase(crd.begin() + j); j--; } else break; } newstr += topleft + bottomright; if( i < crd.size() -1 ) newstr += ":"; } if(newstr.length() > 0) str = newstr; } void emsOCR::RemoveDashAndTheFollowingSpace(QString &text) { text = text.replace("- ", ""); } QString emsOCR::CurrentDateTime() { QDateTime now = QDateTime::currentDateTime(); return now.toString(QString("yy-MM-dd hh:mm:ss")); } //* void emsOCR::ShowException ( IDRSException & theIDRSException ) { if(m_debug) out << "An error occured in the iDRS." << endl; if(m_debug) out << "Code " << theIDRSException.m_code << endl; if(m_debug) out << "File " << theIDRSException.m_strSrcFile << endl; if(m_debug) out << "Line " << theIDRSException.m_uiSrcLine << endl; switch ( theIDRSException.m_code ) { case IDRS_ERROR_INVALID_ARGS: if(m_debug) out << "The set of parameters provided is not supported. Please check the parameters." << endl; break; case IDRS_ERROR_FILE_OPEN: if(m_debug) out <<"Unable to load the specified file. Please check its path." << endl; break; case IDRS_ERROR_NO_IMAGING_MODULE_READY: if(m_debug) out <<"No imaging module is ready. An imaging module is necessary to load/save image files." << endl; break; case IDRS_ERROR_DRS_ASIAN_NOT_READY: if(m_debug) out <<"The Asian OCR add-on is not ready." << endl; break; case IDRS_ERROR_DRS_HEBREW_NOT_READY: if(m_debug) out <<"The Hebrew OCR add-on is not ready." << endl; break; case IDRS_ERROR_DRS_ICR_NOT_READY: if(m_debug) out <<"The ICR module is not ready." << endl; break; case IDRS_ERROR_DRS_BANKING_FONTS_NOT_READY: if(m_debug) out <<"The banking fonts recognition add-on is not ready." << endl; break; case IDRS_ERROR_DRS_ARABIC_NOT_READY: if(m_debug) out <<"The Arabic OCR add-on is not ready." << endl; break; default: if(m_debug) out <<"General Error occured." << endl; break; } } //*/ void emsOCR::SetPagesInView() { m_ocrDialog->ui->queueWidget->clear(); QString p = m_pagesToProcess[m_currentIndex].m_customName; m_ocrDialog->ui->currentLineEdit->setText(p); for(int i=0; i< m_pagesToProcess.count(); i++) { if(m_currentIndex == i) continue; if(m_pagesToProcess[i].m_isProcessed) continue; p = m_pagesToProcess[i].m_customName; QListWidgetItem *item = new QListWidgetItem(p); m_ocrDialog->ui->queueWidget->addItem(item); } } void emsOCR::ClearPagesInView() { m_ocrDialog->ui->queueWidget->clear(); m_ocrDialog->ui->currentLineEdit->setText(""); } void emsOCR::AddPageToProcessedTable(bool successful, QString date, QString page, double time, int keywordCount) { QTableWidgetItem *itemDate = new QTableWidgetItem( date ); QTableWidgetItem *itemName = new QTableWidgetItem( page ); QTableWidgetItem *itemTime = new QTableWidgetItem( QString("%1").arg(time) ); QTableWidgetItem *itemKWCount = new QTableWidgetItem( QString("%1").arg(keywordCount) ); int row = m_ocrDialog->ui->processedTableWidget->rowCount(); m_ocrDialog->ui->processedTableWidget->insertRow(row); m_ocrDialog->ui->processedTableWidget->setItem(row, 0, itemDate); m_ocrDialog->ui->processedTableWidget->setItem(row, 1, itemName); m_ocrDialog->ui->processedTableWidget->setItem(row, 2, itemTime); m_ocrDialog->ui->processedTableWidget->setItem(row, 3, itemKWCount); } void emsOCR::IncrementTotalProcssedPages() { m_ocrDialog->m_totalNumberOfProcessedPages++; m_ocrDialog->ui->totalPagesLabel->setText(QString("Total Pages Processed: %1").arg(m_ocrDialog->m_totalNumberOfProcessedPages)); } void emsOCR::UpdateOCRStatus(int id_publication, int id_publication_issue, int latest_page_number) { if(m_debug) out << CurrentDateTime().toStdString() << "\t---- UpdateOCRStatus()" << endl; static bool updateOperation = false; QSqlQuery query(m_database); if(!updateOperation) { query.prepare("select * from emsOCR_status where id_emsOCR = :id"); query.bindValue(":id", m_id); query.exec(); if(query.next()) updateOperation = true; } if(!updateOperation) { query.prepare("INSERT INTO emsOCR_status (id_emsOCR, latest_query_time, id_publication, id_publication_issue, latest_page_number ) VALUES (" ":id_emsOCR, NOW(), :id_publication, :id_publication_issue, :latest_page_number)"); } else { query.prepare("UPDATE emsOCR_status SET latest_query_time = NOW(), " " id_publication = :id_publication, id_publication_issue = :id_publication_issue, latest_page_number = :latest_page_number " " WHERE id_emsOCR = :id_emsOCR"); } query.bindValue(":id_emsOCR", m_id); query.bindValue(":id_publication", id_publication); query.bindValue(":id_publication_issue", id_publication_issue); query.bindValue(":latest_page_number", latest_page_number); bool ret = query.exec(); if(!ret) { if(m_debug) out << CurrentDateTime().toStdString() << "\tCall to sql query returned false" << endl; if(m_debug) out << CurrentDateTime().toStdString() << "\t\tReason: " << query.lastError().text().toStdString() << endl; if(m_debug) { out << "query executed: " << query.lastQuery().toStdString() << endl; out << "parameters used: " << endl; QMapIterator i(query.boundValues()); while (i.hasNext()) { i.next(); out << i.key().toAscii().data() << ": " << i.value().toString().toAscii().data() << endl; } } } else updateOperation = true; if(m_debug) out << CurrentDateTime().toStdString() << " ---- End of UpdateOCRStatus() ---- " << endl; }