1150 lines
32 KiB
C++
1150 lines
32 KiB
C++
#include "idrs_config.h"
|
|
#include "OurOCREngine.h"
|
|
|
|
#include <QtGui/QtGui>
|
|
#include <QtCore/QtCore>
|
|
|
|
#include "qtservice.h"
|
|
|
|
|
|
#include <iostream>
|
|
#include <sstream>
|
|
#include <vector>
|
|
#include <cstdio>
|
|
#include <cmath>
|
|
#include <algorithm>
|
|
#include <cctype>
|
|
#include <exception>
|
|
#include <cstdlib>
|
|
#include <ctime>
|
|
#include <fstream>
|
|
|
|
#include "Coordinates.h"
|
|
#include "Word.h"
|
|
#include "Tag.h"
|
|
|
|
using namespace std;
|
|
|
|
// Shared log stream used by every function in this file; it is opened in
// append mode on "C:/ems-log.txt" by the OurOCREngine constructor.
ofstream out;
|
|
// Stop-request flag: raised by OurOCREngine::Stop() and polled by
// OurOCREngine::Process(); both access it while holding `mutex`.
bool stop = false;
|
|
// Guards accesses to `stop` in Stop() and Process().
QMutex mutex;
|
|
|
|
// NOTE(review): not referenced anywhere in the visible part of this file
// (Initalize() uses a local IDRS_FILE_SETUP_INFO instead) - confirm it is needed.
IDRS_FILE_SETUP_INFO gSampleSetupInfo;
|
|
|
|
// NOTE(review): only mentioned in a commented-out check inside
// GetPagesToProcess(); presumably the intended batch size - confirm.
#define MAX_PAGES 5
|
|
|
|
|
|
//bool OurOCREngine::m_stop = false;
|
|
/**
 * Constructs the engine: clears the global stop request and opens the shared
 * log file ("C:/ems-log.txt") in append mode.
 */
OurOCREngine::OurOCREngine()
{
    // The two statements below do not depend on each other.
    stop = false;
    out.open("C:/ems-log.txt", std::ios::app);
}
|
|
|
|
/**
 * Destroys the engine; the only work done here is unloading the iDRS module.
 */
OurOCREngine::~OurOCREngine()
{
    // Unload the iDRS module.
    CIDRS::Unload();
}
|
|
|
|
/**
 * Free-function entry point: writes a timestamped "Start the process" line
 * to the log and then runs the engine's processing loop.
 *
 * @param engine  engine whose Process() method is invoked.
 */
void Start(OurOCREngine &engine)
{
    const std::string timestamp = engine.CurrentDateTime().toStdString();
    out << timestamp << "\tStart the process" << endl;

    engine.Process();
}
|
|
|
|
/*void OurOCREngine::Pause()
|
|
{
|
|
m_pause = true;
|
|
}*/
|
|
|
|
void OurOCREngine::Stop()
|
|
{
|
|
//out << "----------Stopping the process" << endl;
|
|
QMutexLocker locker(&mutex);
|
|
stop = true;
|
|
//out << "----------Stop value is " << stop << endl;
|
|
}
|
|
|
|
|
|
bool OurOCREngine::ReadSettings()
|
|
{
|
|
QString config = QApplication::applicationDirPath()+ "/config.ini";
|
|
QSettings settings(config, QSettings::IniFormat);
|
|
|
|
m_server = settings.value("database/server").toString(); //read database's server value from settings
|
|
m_databaseName = settings.value("database/database").toString(); //read database's database name value from settings
|
|
m_user = settings.value("database/uid").toString(); //read database's user id value from settings
|
|
m_password = settings.value("database/pwd").toString(); //read database's password value from settings
|
|
m_port = settings.value("database/port").toInt(); //read database's port value from settings
|
|
|
|
// if any string is empty return false, else return true
|
|
/*if(m_masterDatabaseName.isEmpty() || m_masterDatabaseHost.isEmpty() || m_masterDatabaseUser.isEmpty() ||
|
|
m_masterDatabasePassword.isEmpty() || ( m_masterDatabasePort == 0) )
|
|
{
|
|
QMessageBox::critical(this, tr("config error"),
|
|
tr("Error reading 'config.ini' master database configurations\nMake sure the file exists, and its properly formatted!"));
|
|
return false;
|
|
}*/
|
|
|
|
return true;
|
|
}
|
|
|
|
bool OurOCREngine::Initalize()
|
|
{
|
|
IDRS_FILE_SETUP_INFO aSampleSetupInfo;
|
|
|
|
m_isReady = false;
|
|
|
|
/* Setup iDRS */
|
|
|
|
try
|
|
{
|
|
if ( SetupIDRS ( aSampleSetupInfo ) != 0 )
|
|
{
|
|
out << CurrentDateTime().toStdString() << "\tUnable to setup iDRS." << endl;
|
|
exit(0); //return false;
|
|
}
|
|
else
|
|
{
|
|
m_isReady = true;
|
|
}
|
|
}
|
|
catch ( IDRSException & aIDRSException )
|
|
{
|
|
CIDRS::Unload ();
|
|
ShowException ( aIDRSException );
|
|
exit(0);//return false;
|
|
}
|
|
|
|
// Create the iDRS objects
|
|
try
|
|
{
|
|
m_reader = CReader::Create ();
|
|
}
|
|
catch ( IDRSException & aIDRSException )
|
|
{
|
|
m_reader = NULL;
|
|
|
|
m_isReady = false;
|
|
ShowException ( aIDRSException );
|
|
CIDRS::Unload ();
|
|
exit(0);
|
|
//return false;
|
|
}
|
|
|
|
if (!ReadSettings() )
|
|
{
|
|
out << CurrentDateTime().toStdString() << "\terror reading settings" << endl;
|
|
m_isReady = false;
|
|
exit(0);
|
|
//return false;
|
|
|
|
}
|
|
if (!ConnectToDatabase() )
|
|
{
|
|
//erro handling show be done here
|
|
out << CurrentDateTime().toStdString() << "\tCould not connect to DB" << endl;
|
|
m_isReady = false;
|
|
exit(0);
|
|
//return false;
|
|
}
|
|
|
|
GetIssuePath();
|
|
|
|
return m_isReady;
|
|
}
|
|
|
|
void OurOCREngine::Process()
|
|
{
|
|
while(true)
|
|
{
|
|
//read pages to process
|
|
//out << "Stop value is " << stop << endl;
|
|
//{
|
|
/*
|
|
QMutexLocker locker(&mutex);
|
|
if( stop )
|
|
{
|
|
out << CurrentDateTime().toStdString() << "\tStopping the service" << endl;
|
|
ResetUnprocessedPages(); //unset the pages to process in the queue
|
|
CIDRS::Unload();
|
|
m_reader = NULL;
|
|
return; //and exit
|
|
}
|
|
}*/
|
|
|
|
GetPagesToProcess();
|
|
if(m_pagesToProcess.size() == 0)
|
|
{
|
|
//no pages to process, sleep for 2 seconds
|
|
//QtServiceBase::instance()->logMessage("No pages to process, sleeping for 2 seconds");
|
|
out << CurrentDateTime().toStdString() << "\tNo pages to process, sleeping for 2 seconds\r";
|
|
Sleep(2*1000);
|
|
continue;
|
|
}
|
|
|
|
for(int i=0; i< m_pagesToProcess.size(); i++)
|
|
{
|
|
{
|
|
QMutexLocker locker(&mutex);
|
|
if( stop )
|
|
break;
|
|
}
|
|
|
|
out << CurrentDateTime().toStdString() << " # of pages to process: " << m_pagesToProcess.size() << ", now processing page # " << i+1 << endl;
|
|
|
|
if( !m_pagesToProcess[i].m_isCopied )
|
|
{
|
|
out << CurrentDateTime().toStdString() << "\tPage " << m_pagesToProcess[i].m_localPath.toStdString() << " could not be copied, skipping to next page" << endl;
|
|
continue; //if its not copied, then we just move to the next copied page
|
|
}
|
|
|
|
m_currentIndex = i;
|
|
QString pagePath = m_pagesToProcess[i].m_localPath;
|
|
//QtServiceBase::instance()->logMessage( QString("start process of page %1").arg(pagePath) );
|
|
out << CurrentDateTime().toStdString() << "\tStart process of page " << pagePath.toStdString() << endl;
|
|
|
|
bool ret = ProcessPage(pagePath, m_pagesToProcess[i].m_isArabic);
|
|
if(ret)
|
|
{
|
|
//we have a successfull processing, we do the insertion of the database, and updating of the status
|
|
QList<Tag> tags = SearchTags( m_pagesToProcess[i].m_isArabic );
|
|
AddToDatabase(m_pagesToProcess[i].m_sectionPagesId, tags);
|
|
|
|
//clear text
|
|
m_pageText = "";
|
|
m_words.clear();
|
|
|
|
//QtServiceBase::instance()->logMessage( QString("Process of page %1 completed successfully").arg(pagePath) );
|
|
out << CurrentDateTime().toStdString() << "\tProcess of page " << pagePath.toStdString() << " completed successfully" << endl;
|
|
m_pagesToProcess[i].m_isProcessed = true;
|
|
}
|
|
else
|
|
{
|
|
//we need to re-process it another time
|
|
//QtServiceBase::instance()->logMessage( QString("Process of page %1 unsuccessfull, re-process one more time").arg(pagePath) );
|
|
out << CurrentDateTime().toStdString() << "\tProcess of page " << pagePath.toStdString() << " unsuccessfull, re-process one more time" << endl;
|
|
|
|
bool ret = ProcessPage(pagePath, m_pagesToProcess[i].m_isArabic);
|
|
if(ret)
|
|
{
|
|
//we have a successfull processing, we do the insertion of the database, and updating of the status
|
|
QList<Tag> tags = SearchTags( m_pagesToProcess[i].m_isArabic );
|
|
AddToDatabase(m_pagesToProcess[i].m_sectionPagesId, tags);
|
|
|
|
//clear text
|
|
m_pageText = "";
|
|
m_words.clear();
|
|
|
|
//QtServiceBase::instance()->logMessage( QString("Process of page %1 completed successfully").arg(pagePath) );
|
|
out << CurrentDateTime().toStdString() << "\tProcess of page " << pagePath.toStdString() << " completed successfully" << endl;
|
|
m_pagesToProcess[i].m_isProcessed = true;
|
|
}
|
|
else
|
|
{
|
|
//we need to re-process it another time
|
|
//QtServiceBase::instance()->logMessage( QString("Process of page %1 unsuccessfull, re-process one more time").arg(pagePath) );
|
|
out << CurrentDateTime().toStdString() << "\tProcess of page " << pagePath.toStdString() << " unsuccessfull, SKIPPING this page" << endl;
|
|
}
|
|
}
|
|
}
|
|
|
|
//now delete the copied pages
|
|
DeleteFiles();
|
|
//and reset the pages that have not been processed to be processed.
|
|
//ResetUnprocessedPages();
|
|
}
|
|
}
|
|
|
|
bool OurOCREngine::ConnectToDatabase()
|
|
{
|
|
m_database = QSqlDatabase::addDatabase("QMYSQL");
|
|
|
|
// initalize the connection
|
|
m_database.setHostName(m_server);
|
|
m_database.setPort(m_port);
|
|
m_database.setDatabaseName(m_databaseName);
|
|
m_database.setUserName(m_user);
|
|
m_database.setPassword(m_password);
|
|
|
|
// try to open the connection, if ok returns true, else returns false
|
|
bool ok = m_database.open();
|
|
if(!ok)
|
|
{
|
|
// if error, get the error from the database text and print it to debug screen
|
|
QString str = m_database.lastError().databaseText();
|
|
//QMessageBox::critical(this, tr("Database Error"), tr("Error connecting to master database\n") + str);
|
|
cout << str.toStdString();
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void OurOCREngine::GetIssuePath()
|
|
{
|
|
QString issueQuery = "SELECT issues_path, issues_windows_drive_letter FROM system_configuration";
|
|
QSqlQuery query(m_database);
|
|
|
|
query.exec(issueQuery);
|
|
|
|
QString str = query.lastError().databaseText();
|
|
|
|
while( query.next() )
|
|
{
|
|
QString path = query.value(0).toString();
|
|
QString drive = query.value(1).toString();
|
|
|
|
m_issuePath = drive + path;
|
|
}
|
|
}
|
|
|
|
void OurOCREngine::ResetUnprocessedPages()
|
|
{
|
|
out << CurrentDateTime().toStdString() << " ---- ResetUnprocessedPages() ---- " << endl;
|
|
QString lockQuery = "LOCK TABLE section_pages WRITE";
|
|
QString unlockQuery = "UNLOCK TABLES";
|
|
QString updateQuery = "UPDATE section_pages SET status = 1 WHERE section_pages = :section_pages";
|
|
|
|
QSqlQuery query(m_database);
|
|
|
|
//first lock the page.
|
|
query.exec( lockQuery );
|
|
|
|
//now change the status of the pages to being processed
|
|
for(int i=0; i< m_pagesToProcess.size(); i++)
|
|
{
|
|
if(!m_pagesToProcess[i].m_isProcessed)
|
|
{
|
|
query.prepare( updateQuery );
|
|
query.bindValue(":section_pages", m_pagesToProcess[i].m_sectionPagesId);
|
|
bool ret = query.exec();
|
|
}
|
|
}
|
|
|
|
//now unlock the pages
|
|
query.exec( unlockQuery );
|
|
|
|
out << CurrentDateTime().toStdString() << " ---- End of ResetUnprocessedPages() ---- " << endl;
|
|
}
|
|
|
|
void OurOCREngine::GetPagesToProcess()
|
|
{
|
|
out << CurrentDateTime().toStdString() << "\t---- GetPagesToProcess()" << endl;
|
|
//first of all, try to lock the dummy table
|
|
//then read the pages that needs to be processed, and get the first n pages (n is determined in the config files)
|
|
//change their status to being processed
|
|
//unlock the dummy table
|
|
QString lockQuery = "LOCK TABLE section_pages WRITE";
|
|
QString unlockQuery = "UNLOCK TABLES";
|
|
QString selectQuery = "SELECT section_pages, page_name FROM section_pages WHERE status = 1 LIMIT 5";
|
|
QString updateQuery = "UPDATE section_pages SET status = 2 WHERE section_pages = :section_pages";
|
|
|
|
m_pagesToProcess.clear();
|
|
|
|
//QSqlQuery query = m_database.query();
|
|
QSqlQuery query(m_database);
|
|
|
|
//first lock the page.
|
|
out << CurrentDateTime().toStdString() << "\t--->lock query: \"" << lockQuery.toStdString() << "\"" << endl;
|
|
out << CurrentDateTime().toStdString() << "\t\tcall lock query ";
|
|
bool ret = query.exec( lockQuery );
|
|
if(!ret)
|
|
{
|
|
out << " returned false" << endl;
|
|
out << CurrentDateTime().toStdString() << "\t\treason: " << query.lastError().databaseText().toStdString() << endl;
|
|
return;
|
|
}
|
|
else
|
|
out << " returned true" << endl;
|
|
|
|
|
|
out << CurrentDateTime().toStdString() << "\tSelect query: \"" << selectQuery.toStdString() << "\"" << endl;
|
|
out << CurrentDateTime().toStdString() << "\tCall select query " ;
|
|
ret = query.exec( selectQuery );
|
|
if(!ret)
|
|
{
|
|
out << " returned false" << endl;
|
|
out << CurrentDateTime().toStdString() << "\t\treason: " << query.lastError().databaseText().toStdString() << endl;
|
|
return;
|
|
}
|
|
else
|
|
out << " returned true" << endl;
|
|
|
|
out << CurrentDateTime().toStdString() << "\tNumber of rows returned " << query.size() << endl;
|
|
|
|
if( query.size() > 0 )
|
|
{
|
|
while ( query.next() )
|
|
{
|
|
//if( i >= MAX_PAGES) //break;
|
|
|
|
QString sectionPages = query.value(0).toString();
|
|
QString pageName = query.value(1).toString();
|
|
|
|
out << "\t\t\tAdd " << pageName.toStdString() << " to list of pages to be processed" << endl;
|
|
|
|
PageInfo pi;
|
|
pi.m_sectionPagesId = sectionPages.toInt();
|
|
pi.m_remotePath = pageName;
|
|
pi.m_localPath = pageName;
|
|
|
|
m_pagesToProcess.append(pi);
|
|
}
|
|
|
|
//now change the status of the pages to being processed
|
|
for(int i=0; i< m_pagesToProcess.size(); i++)
|
|
{
|
|
query.prepare( updateQuery );
|
|
query.bindValue(":section_pages", m_pagesToProcess[i].m_sectionPagesId);
|
|
bool ret = query.exec();
|
|
if(!ret)
|
|
{
|
|
out << CurrentDateTime().toStdString() << "\tCall to update query returned false" << endl;
|
|
out << CurrentDateTime().toStdString() << "\t\tReason: " << query.lastError().databaseText().toStdString() << endl;
|
|
return;
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
//now unlock the pages
|
|
out << CurrentDateTime().toStdString() << "\tUnlock query: \"" << unlockQuery.toStdString() << "\"" << endl;
|
|
out << CurrentDateTime().toStdString() << "\tCall unlock query ";
|
|
ret = query.exec( unlockQuery );
|
|
if(!ret)
|
|
{
|
|
out << " returned false" << endl;
|
|
out << CurrentDateTime().toStdString() << "\t\tReason: " << query.lastError().databaseText().toStdString() << endl;
|
|
return;
|
|
}
|
|
else
|
|
out << " returned true" << endl;
|
|
|
|
|
|
if( m_pagesToProcess.size() > 0 )
|
|
ConstructPagesPath();
|
|
|
|
out << CurrentDateTime().toStdString() << " ---- End of GetPagesToProcess() ---- " << endl;
|
|
|
|
}
|
|
|
|
void OurOCREngine::ConstructPagesPath()
|
|
{
|
|
out << CurrentDateTime().toStdString() << " ---- ConstructPagesPath() ---- " << endl;
|
|
QSqlQuery query (m_database);
|
|
|
|
for(int i = 0; i < m_pagesToProcess.size(); i++)
|
|
{
|
|
ostringstream oss;
|
|
oss << "SELECT issue_date, pi.id_publication, id_section, `language` FROM "
|
|
<< " publication as pi, publication_issue as pis, issue_sections as ise, section_pages as sp "
|
|
<< " WHERE "
|
|
<< " pi.id_publication = pis.id_publication AND "
|
|
<< " pis.id_publication_issue = ise.id_publication_issue AND "
|
|
<< " ise.id_issue_sections = sp.id_issue_sections AND "
|
|
<< " section_pages = " << m_pagesToProcess[i].m_sectionPagesId;
|
|
|
|
QString str = QString::fromStdString( oss.str() );
|
|
query.exec( str );
|
|
|
|
while(query.next() )
|
|
{
|
|
QString issueDate = query.value(0).toString();
|
|
QString publicationID = query.value(1).toString();
|
|
QString sectionID = query.value(2).toString();
|
|
QString language = query.value(3).toString();
|
|
|
|
//path is in the format of: Fileserver_path_Issues/YYYY/MM/DD/ID_PUBLICATION/ID_SECTION/
|
|
QStringList dateVecor = issueDate.split("-");
|
|
QString remotepath = QString("%1%2\\%3\\%4\\%5\\%6\\300dpi\\%7")
|
|
.arg(m_issuePath)
|
|
.arg(dateVecor[0])
|
|
.arg(dateVecor[1])
|
|
.arg(dateVecor[2])
|
|
.arg(publicationID)
|
|
.arg(sectionID)
|
|
.arg(m_pagesToProcess[i].m_remotePath);
|
|
remotepath.replace("/", "\\");
|
|
|
|
//m_issuePath + dateVecor[0] + "/" + dateVecor[1] + "/" + dateVecor[2] + "/" + publicationID + "/" + sectionID + "/300dpi/" + m_pagesToProcess[i].m_remotePath;
|
|
|
|
QString localpath = QString("C:\\%1_%2_%3").arg(publicationID).arg(sectionID).arg(m_pagesToProcess[i].m_localPath);
|
|
|
|
m_pagesToProcess[i].m_remotePath = remotepath;
|
|
m_pagesToProcess[i].m_localPath = localpath;
|
|
bool isArabic = ( language.compare("Arabic", Qt::CaseInsensitive) == 0 );
|
|
m_pagesToProcess[i].m_isArabic = isArabic;
|
|
m_pagesToProcess[i].m_isCopied = false;
|
|
m_pagesToProcess[i].m_isProcessed = false;
|
|
}
|
|
}
|
|
|
|
//copy Files from remoate path to local path
|
|
CopyFiles();
|
|
|
|
out << CurrentDateTime().toStdString() << " ---- End of ConstructPagesPath() ---- " << endl;
|
|
}
|
|
|
|
void OurOCREngine::CopyFiles()
|
|
{
|
|
out << CurrentDateTime().toStdString() << " ---- CopyFile() ---- " << endl;
|
|
|
|
for(int i=0; i < m_pagesToProcess.count(); i++)
|
|
{
|
|
QFile file;
|
|
bool ret = file.copy(m_pagesToProcess[i].m_remotePath, m_pagesToProcess[i].m_localPath);
|
|
int j = 0;
|
|
while(!ret && j < 5)
|
|
{
|
|
ret = file.copy(m_pagesToProcess[i].m_remotePath, m_pagesToProcess[i].m_localPath);
|
|
j++;
|
|
}
|
|
|
|
//check the file is exists
|
|
ret = file.exists(m_pagesToProcess[i].m_localPath);
|
|
|
|
out << "\tCopy of file " << m_pagesToProcess[i].m_remotePath.toStdString() << " to " << m_pagesToProcess[i].m_localPath.toStdString() <<" is ";
|
|
|
|
if(ret)
|
|
{
|
|
m_pagesToProcess[i].m_isCopied = true;
|
|
out << "successfull" << endl;
|
|
}
|
|
else
|
|
{
|
|
m_pagesToProcess[i].m_isCopied = false;
|
|
out << "un-successfull" << endl;
|
|
out << "\tReason: " << file.errorString().toStdString() << endl;
|
|
}
|
|
|
|
|
|
}
|
|
|
|
out << CurrentDateTime().toStdString() << " ---- End of CopyFile() ---- " << endl;
|
|
}
|
|
|
|
void OurOCREngine::DeleteFiles()
|
|
{
|
|
out << CurrentDateTime().toStdString() << " ---- DeleteFiles() ---- " << endl;
|
|
for(int i=0; i < m_pagesToProcess.count(); i++)
|
|
{
|
|
bool ret = QFile::remove(m_pagesToProcess[i].m_localPath);
|
|
int j = 0;
|
|
while(!ret && j < 5)
|
|
{
|
|
ret = QFile::remove(m_pagesToProcess[i].m_localPath);
|
|
j++;
|
|
}
|
|
}
|
|
out << CurrentDateTime().toStdString() << " ---- End of DeleteFiles() ---- " << endl;
|
|
}
|
|
/**
 * Runs OCR on one page image and collects the result.
 *
 * Phase 1 selects the OCR language, loads the image and lets the reader
 * recognise it. Phase 2 walks over all zones of the page and their meaning
 * elements: text elements are appended to the current word and to the page
 * text, word/line/paragraph markers close the current word and insert the
 * separators. Finally hyphenated words are joined, both in the page text
 * ("- " is removed) and in the word list.
 *
 * Results are written to m_pageText and appended to m_words.
 *
 * Changes compared with the previous version:
 *  - the word rectangle is zero-initialised (it could be read before the
 *    first word marker had set it);
 *  - a word marker no longer appends an empty word;
 *  - unknown exceptions are reported in the log file instead of std::cout;
 *  - the unused bounding-rectangle lookup per zone was removed.
 *
 * @param pagePath  path of the image file to recognise.
 * @param isArabic  true to recognise Arabic, false for English.
 * @return true on success, false when iDRS (or anything else) threw.
 */
bool OurOCREngine::ProcessPage(QString &pagePath, bool isArabic)
{
    out << CurrentDateTime().toStdString() << " ---- ProcessPage() ---- " << endl;

    IDRS::CPage aPage;

    // Phase 1: configure the language, load the image, recognise it.
    try
    {
        if (isArabic)
        {
            COCRContext aOCRContext = COCRContext::Create(IDRS_LNG_ARABIC);
            m_reader.SetOCRParameters(aOCRContext);
        }
        else
        {
            COCRContext aOCRContext = COCRContext::Create(IDRS_LNG_ENGLISH);
            m_reader.SetOCRParameters(aOCRContext);
        }

        // Load the page.
        aPage = IDRS::CPage::Create();

        std::wstring page = pagePath.toStdWString();
        aPage.LoadSourceImage((IDRS::IDRS_CTSTR) page.c_str());

        // Do the recognition.
        m_reader.Read(aPage);
    }
    catch (IDRS::IDRSException &aIDRSException)
    {
        ShowException(aIDRSException);

        aPage = NULL;
        out << CurrentDateTime().toStdString() << "\t Error Loading page: " << pagePath.toStdString() << endl;
        return false;
    }

    // Phase 2: turn the recognition result into page text and words.
    try
    {
        IDRS::CZone aZone;
        IDRS::CZoneMeaning aZoneMeaning;
        IDRS::CMeaningElt aMeaningElt;
        IDRS::CMeaningEltFmtNfo aMeaningEltFmtNfo;
        IDRS::CMeaningEltTxt aMeaningEltTxt;
        IDRS::IDRS_UINT ui, uiZoneIndex, uiMeaningEltIndex;

        // These two flags deliberately live across zones.
        IDRS::IDRS_BOOL bFirstLine, bFirstWordInLine;
        bFirstLine = bFirstWordInLine = IDRS_TRUE;

        IDRS::IDRS_RECT rect;

        wstring pageText;

        for (uiZoneIndex = 0; uiZoneIndex < aPage.GetZoneCount(); uiZoneIndex++)
        {
            aZone = aPage.GetZoneAt(uiZoneIndex);
            aZoneMeaning = aZone.Meaning();

            // Text of the word that is currently being assembled, and the
            // rectangle reported by the most recent word/line marker.
            wstring str;
            int itop = 0, ileft = 0, ibottom = 0, iright = 0;

            if (aZoneMeaning.IsNull())
                continue;

            for (uiMeaningEltIndex = 0; uiMeaningEltIndex < aZoneMeaning.GetCount(); uiMeaningEltIndex++)
            {
                aMeaningElt = aZoneMeaning.GetAt(uiMeaningEltIndex);

                switch (aMeaningElt.GetMEType())
                {
                case IDRS_ME_TEXT:
                    aMeaningEltTxt = aMeaningElt;

                    for (ui = 0; ui < aMeaningEltTxt.GetLength(); ui++)
                    {
                        const wchar_t ch = (wchar_t) aMeaningEltTxt.GetBuffer()[ui];
                        str += ch;
                        pageText += ch;
                    }
                    break;

                case IDRS_ME_FMT_NFO:
                    aMeaningEltFmtNfo = aMeaningElt;

                    switch (aMeaningEltFmtNfo.GetFmtNfoType())
                    {
                    case IDRS_FMT_NFO_WORD_START:
                        aMeaningEltFmtNfo.GetFmtNfoValue(rect);
                        if (!bFirstWordInLine)
                        {
                            // A new word starts: store the previous one.
                            // Coordinates are stored divided by 3
                            // (presumably 300 dpi -> 100 dpi; TODO confirm).
                            if (str.length() > 0)
                            {
                                Word w(str, itop / 3, ileft / 3, ibottom / 3, iright / 3);
                                m_words.append(w);
                            }

                            str = L"";
                            pageText += L" ";
                        }
                        else
                        {
                            bFirstWordInLine = IDRS_FALSE;
                        }

                        itop = rect.uiTop;
                        ileft = rect.uiLeft;
                        ibottom = rect.uiBottom;
                        iright = rect.uiRight;
                        break;

                    case IDRS_FMT_NFO_LINE_START:
                        if (!bFirstLine)
                        {
                            pageText += L" ";

                            // Store the last word of the previous line.
                            if (str.length() > 0)
                            {
                                Word w(str, itop / 3, ileft / 3, ibottom / 3, iright / 3);
                                m_words.append(w);

                                str = L"";
                            }

                            aMeaningEltFmtNfo.GetFmtNfoValue(rect);
                            itop = rect.uiTop;
                            ileft = rect.uiLeft;
                            ibottom = rect.uiBottom;
                            iright = rect.uiRight;
                        }
                        else
                        {
                            bFirstLine = IDRS_FALSE;
                        }

                        bFirstWordInLine = IDRS_TRUE;
                        break;

                    case IDRS_FMT_NFO_PARA_START:
                        pageText += L"\n";
                        break;

                    default:
                        break;
                    }
                    break;

                default:
                    break;
                }
            }

            // Store the last word of the zone.
            if (str.length() > 0)
            {
                Word w(str, itop / 3, ileft / 3, ibottom / 3, iright / 3);
                m_words.append(w);

                str = L"";
            }
        }

        m_pageText = QString::fromStdWString(pageText);

        RemoveDashAndTheFollowingSpace(m_pageText);

        // Join hyphenated words in the word list as well: a word ending in
        // '-' absorbs its successor, which is then removed from the list.
        for (int i = 0; i < m_words.count() - 1; i++)
        {
            if (m_words[i].word().endsWith("-"))
            {
                m_words[i].appendToWord(m_words[i + 1].word(), true);
                m_words[i].AddCoordinates(m_words[i + 1].coordinates());
                m_words.removeAt(i + 1);

                // Look at the same entry again: the joined word may itself
                // end in '-'.
                i--;
            }
        }
    }
    catch (IDRS::IDRSException &aIDRSException)
    {
        ShowException(aIDRSException);
        out << CurrentDateTime().toStdString() << "\t Error Loading page: " << pagePath.toStdString() << endl;
        return false;
    }
    catch (...)
    {
        out << CurrentDateTime().toStdString() << "\tAn error occured in the iDRS. Page: " << pagePath.toStdString() << endl;
        return false;
    }

    out << CurrentDateTime().toStdString() << " ---- End of ProcessPage() ---- " << endl;

    return true;
}
|
|
|
|
void OurOCREngine::GetTagsFromDatabase(bool isArabic)
|
|
{
|
|
//check if we need to get a newer version of the database
|
|
out << CurrentDateTime().toStdString() << " ---- GetTagsFromDatabase() ---- " << endl;
|
|
|
|
QSqlQuery query( m_database );
|
|
bool ret;
|
|
if(isArabic)
|
|
ret = query.exec( "SELECT id_company, name_company_ar as name_company FROM company WHERE date_end > NOW() AND name_company_ar not like ''" );
|
|
else
|
|
ret = query.exec( "SELECT id_company, name_company FROM company WHERE date_end > NOW()" );
|
|
|
|
if(ret)
|
|
m_databaseTags.clear();
|
|
|
|
while( query.next() )
|
|
{
|
|
int id = query.value(0).toInt();
|
|
QString tag = query.value(1).toString(); //QString::fromUtf8( query.value(1).toString().toAscii().data() );
|
|
|
|
TagInfo ti;
|
|
ti.m_id = id;
|
|
ti.m_name = tag;
|
|
|
|
m_databaseTags.append(ti);
|
|
}
|
|
|
|
//if(isArabic)
|
|
// ret = query.exec("SELECT cm.id_company, name_searchword_ar FROM company AS cm, searchword AS sw WHERE sw.id_company = cm.id_company AND cm.date_end > NOW() AND name_searchword_ar not like ''");
|
|
//else
|
|
ret = query.exec("SELECT cm.id_company, name_searchword FROM company AS cm, searchword AS sw WHERE sw.id_company = cm.id_company AND cm.date_end > NOW()");
|
|
|
|
while( query.next() )
|
|
{
|
|
int id = query.value(0).toInt();
|
|
QString tag = query.value(1).toString(); //QString::fromUtf8( query.value(1).toString().toAscii().data() );
|
|
TagInfo ti;
|
|
ti.m_id = id;
|
|
ti.m_name = tag;
|
|
|
|
m_databaseTags.append(ti);
|
|
}
|
|
|
|
out << CurrentDateTime().toStdString() << " ---- End of GetTagsFromDatabase() ---- " << endl;
|
|
}
|
|
|
|
/**
 * Searches the current page text for every tag known to the database and
 * returns the tags that could also be located in the word list.
 *
 * For each occurrence of a tag in m_pageText the coordinates of the matching
 * word(s) are looked up with GetTagCoordinates(); multi-word matches are
 * reduced to one box per line with ConvertToSingleBox(). Each resulting
 * coordinate string is added to the result through Tag::AddTag().
 *
 * Empty tags are skipped and a zero-length match advances the search
 * position by one character. Previously both cases never advanced the
 * position and the loop did not terminate.
 *
 * NOTE(review): the tag text is used as a regular expression pattern, so
 * characters such as '.', '+' or '(' in a company name are interpreted as
 * regex syntax - confirm that this is intended.
 *
 * @param isArabic  forwarded to GetTagsFromDatabase().
 * @return the list of tags found on the page.
 */
QList<Tag> OurOCREngine::SearchTags(bool isArabic)
{
    out << CurrentDateTime().toStdString() << " ---- SearchTags() ---- " << endl;

    QList<Tag> tags;

    GetTagsFromDatabase(isArabic);

    cout << "Searching for keywords" << endl;

    clock_t start = clock();

    for (int i = 0; i < m_databaseTags.count(); i++)
    {
        const int id = m_databaseTags[i].m_id;
        const QString tag = m_databaseTags[i].m_name;

        // An empty pattern matches everywhere with length 0.
        if (tag.isEmpty())
            continue;

        QRegExp regEx(tag);
        int pos = 0;
        while ((pos = regEx.indexIn(m_pageText, pos)) != -1)
        {
            const int matched = regEx.matchedLength();
            if (matched <= 0)
            {
                // Nothing was consumed: step forward by hand.
                pos++;
                if (pos > m_pageText.length())
                    break;
                continue;
            }

            const QString foundTag = m_pageText.mid(pos, matched);
            pos += matched;

            // A match containing a space spans several words.
            const bool ismultiword = (foundTag.indexOf(" ") != -1);

            QStringList coorVector = GetTagCoordinates(foundTag, ismultiword);

            for (int c = 0; c < coorVector.count(); c++)
            {
                QString str = coorVector[c];

                // The boxes of the individual words are merged into one box.
                if (ismultiword)
                    ConvertToSingleBox(str);

                Tag t(id, str);
                Tag::AddTag(tags, t);
            }
        }
    }

    double diff = (clock() - start) / (double) CLOCKS_PER_SEC;
    cout << "1- Time took to search for keywords is " << diff << endl;

    out << CurrentDateTime().toStdString() << " ---- End of SearchTags() ---- " << endl;

    return tags;
}
|
|
|
|
|
|
void OurOCREngine::DeleteFromDatabase(int id_section_pages)
|
|
{
|
|
out << CurrentDateTime().toStdString() << " ---- DeleteFromDatabase() ---- " << endl;
|
|
|
|
QSqlQuery query(m_database);
|
|
|
|
//DELETE page text
|
|
query.prepare("DELETE FROM page_text WHERE id_section_pages = :id_section_pages");
|
|
query.bindValue(":id_section_pages", id_section_pages);
|
|
bool ret = query.exec( );
|
|
|
|
/*query.prepare("DELETE FROM page_tag_coordinates WHERE id_section_pages = :id_section_pages");
|
|
query.bindValue(":id_section_pages", id_section_pages);
|
|
ret = query.exec( );*/
|
|
|
|
//now insert into the page_word_coordinates
|
|
query.prepare("DELETE FROM page_word_coordinates WHERE id_section_pages = :id_section_pages");
|
|
query.bindValue(":id_section_pages", id_section_pages);
|
|
ret = query.exec( );
|
|
|
|
out << CurrentDateTime().toStdString() << " ---- End of DeleteFromDatabase() ---- " << endl;
|
|
|
|
}
|
|
|
|
void OurOCREngine::AddToDatabase(int id_section_pages, QList<Tag> &tags)
|
|
{
|
|
out << CurrentDateTime().toStdString() << " ---- AddToDatabase() ---- " << endl;
|
|
|
|
//first thing, we delete the previous entries from database
|
|
DeleteFromDatabase(id_section_pages);
|
|
|
|
QSqlQuery query( m_database );
|
|
//insert page text
|
|
{
|
|
//out << "text is " << m_pageText.toUtf8().data() << endl << endl << endl;
|
|
//QString qtext = "INSERT INTO page_text (`id_section_pages`, `text`) VALUES (";
|
|
//qtext += QString("%1").arg(id_section_pages);
|
|
//qtext += ", " + m_pageText + ")";
|
|
m_pageText.remove("\"");
|
|
m_pageText.remove("'");
|
|
m_pageText.remove("`");
|
|
|
|
query.prepare("INSERT INTO page_text (`id_section_pages`, `text`) VALUES (:id_section_pages, :pagetext)");
|
|
query.bindValue(":id_section_pages", id_section_pages);
|
|
query.bindValue(":pagetext", m_pageText );
|
|
|
|
if ( !query.exec() )
|
|
{
|
|
out << "AddToDatabase() Error happend, because of: " << query.lastError().text().toStdString();
|
|
out << m_pageText.toStdString();
|
|
}
|
|
}
|
|
|
|
|
|
for(int i=0; i < tags.count(); i++)
|
|
{
|
|
Tag t = tags[i];
|
|
ostringstream os;
|
|
query.prepare("INSERT INTO page_tag_coordinates (`id_section_pages`, `id_company`, `coordinates`) VALUES (:id_section_pages, :tag, :coord)");
|
|
query.bindValue(":id_section_pages" , id_section_pages);
|
|
query.bindValue(":tag" , t.id() );
|
|
query.bindValue(":coord" , t.coordinates() );
|
|
|
|
bool ret = query.exec( );
|
|
}
|
|
|
|
//now insert into the page_word_coordinates
|
|
/*for(int i=0; i < m_words.count(); i++)
|
|
{
|
|
//QString word = m_words[i].word();
|
|
out << "word is: " << m_words[i].word().toUtf8().data() << endl;
|
|
|
|
QString coord = m_words[i].FormatCoordinatesToExport();
|
|
|
|
query.prepare("INSERT INTO page_word_coordinates (`id_section_pages`, `word`, `coordinates`) VALUES (:id_section_pages, :word, :coord)");
|
|
query.bindValue(":id_section_pages", id_section_pages);
|
|
query.bindValue(":word", m_words[i].word().toUtf8().data() );
|
|
query.bindValue(":coord", coord);
|
|
|
|
bool ret = query.exec();
|
|
}*/
|
|
|
|
out << CurrentDateTime().toStdString() << " ---- End of AddToDatabase() ---- " << endl;
|
|
}
|
|
|
|
/**
 * Looks a tag up in the word list (m_words) and returns the coordinates of
 * every occurrence.
 *
 * Every "'s" is removed from the tag before the comparison, and words are
 * compared case-insensitively.
 *
 *  - Single word: one entry per matching word, formatted by
 *    Word::FormatCoordinatesToExport().
 *  - Multi word: the tag is split at spaces and searched as a sequence of
 *    consecutive words; one entry per occurrence, consisting of the
 *    coordinates of all its words joined with ':'.
 *
 * Fixes in the multi-word search:
 *  - the last possible start position is now examined (a tag formed by the
 *    final words of the page was never found);
 *  - after a match the scan continues with the word directly behind it (one
 *    additional word used to be skipped);
 *  - empty parts caused by consecutive spaces are ignored.
 *
 * @param tag          text of the tag as found in the page text.
 * @param ismultiword  true when the tag consists of several words.
 * @return the coordinate strings of all occurrences (possibly empty).
 */
QStringList OurOCREngine::GetTagCoordinates(QString tag, bool ismultiword)
{
    QStringList locations;

    tag.replace("'s", "");

    if (!ismultiword)
    {
        for (int i = 0; i < m_words.size(); i++)
        {
            if (tag.compare(m_words[i].word(), Qt::CaseInsensitive) == 0)
                locations.append(m_words[i].FormatCoordinatesToExport());
        }

        return locations;
    }

    const QStringList tagParts = tag.split(" ", QString::SkipEmptyParts);
    const int partCount = tagParts.count();
    if (partCount == 0)
        return locations;

    // i is the index of the first word of the candidate sequence.
    for (int i = 0; i + partCount <= m_words.size(); i++)
    {
        bool found = true;
        for (int j = 0; j < partCount; j++)
        {
            if (tagParts[j].compare(m_words[i + j].word(), Qt::CaseInsensitive) != 0)
            {
                found = false;
                break;
            }
        }

        if (!found)
            continue;

        // Coordinates of all words of the occurrence, separated by ':'.
        QString str = "";
        for (int j = 0; j < partCount; j++)
        {
            str += m_words[i + j].FormatCoordinatesToExport();
            if (j != partCount - 1)
                str += ":";
        }

        locations.append(str);

        // Continue behind the occurrence; the loop increment adds the
        // remaining step.
        i += partCount - 1;
    }

    return locations;
}
|
|
|
|
/**
 * Merges the boxes of a multi-word tag into one box per text line.
 *
 * Input and output format: boxes separated by ':', each box being
 * "top,left,bottom,right". Consecutive boxes whose top values differ by less
 * than 6 are treated as lying on the same line and are replaced by a single
 * box that takes top/left from the first and bottom/right from the last of
 * them.
 * NOTE(review): this assumes the boxes arrive left to right; for
 * right-to-left text the resulting box may be inverted - verify.
 *
 * A box with fewer than four fields makes the function return with str
 * unchanged (it used to index past the end of the list).
 *
 * @param str  in: the individual boxes; out: the merged boxes.
 */
void OurOCREngine::ConvertToSingleBox(QString &str)
{
    const QStringList boxes = str.split(":");

    vector<Coordinates> crd;
    for (int i = 0; i < boxes.count(); i++)
    {
        const QStringList fields = boxes[i].split(",");
        if (fields.count() < 4)
            return;   // malformed input: leave str as it is

        Coordinates c(fields[0].toInt(), fields[1].toInt(), fields[2].toInt(), fields[3].toInt());
        crd.push_back(c);
    }

    QString newstr = "";
    ostringstream oss;
    for (size_t i = 0; i < crd.size(); i++)
    {
        oss.str("");
        oss << crd[i].top() << "," << crd[i].left() << ",";
        const QString topleft = QString::fromStdString(oss.str());

        oss.str("");
        oss << crd[i].bottom() << "," << crd[i].right();
        QString bottomright = QString::fromStdString(oss.str());

        // Absorb the following boxes as long as they are on the same line.
        while (i + 1 < crd.size() && abs(crd[i].top() - crd[i + 1].top()) < 6)
        {
            oss.str("");
            oss << crd[i + 1].bottom() << "," << crd[i + 1].right();
            bottomright = QString::fromStdString(oss.str());

            crd.erase(crd.begin() + (i + 1));
        }

        newstr += topleft + bottomright;
        if (i + 1 < crd.size())
            newstr += ":";
    }

    if (newstr.length() > 0)
        str = newstr;
}
|
|
|
|
|
|
/**
 * Removes every occurrence of a dash followed by a space ("- ") from the
 * given text, in place.
 *
 * @param text  text to clean; modified by this call.
 */
void OurOCREngine::RemoveDashAndTheFollowingSpace(QString &text)
{
    const QString dashAndSpace("- ");
    const QString nothing("");

    text.replace(dashAndSpace, nothing);
}
|
|
|
|
/**
 * Returns the current local date and time formatted as "yy-MM-dd hh:mm:ss";
 * used as the timestamp prefix of the log lines.
 */
QString OurOCREngine::CurrentDateTime()
{
    const QString stampFormat("yy-MM-dd hh:mm:ss");
    return QDateTime::currentDateTime().toString(stampFormat);
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Writes an iDRS exception to the log: a fixed header, the error code, the
 * source file and the source line, followed by an explanatory sentence for
 * the error codes known here. Other codes get no explanation.
 *
 * @param theIDRSException  the exception to report.
 */
void OurOCREngine::ShowException(IDRSException &theIDRSException)
{
    out << "An error occured in the iDRS." << endl;
    out << "Code " << theIDRSException.m_code << endl;
    out << "File " << theIDRSException.m_strSrcFile << endl;
    out << "Line " << theIDRSException.m_uiSrcLine << endl;

    // Pick the explanation first, print it afterwards.
    const char *explanation = NULL;
    switch (theIDRSException.m_code)
    {
    case IDRS_ERROR_INVALID_ARGS:
        explanation = "The set of parameters provided is not supported. Please check the parameters.";
        break;
    case IDRS_ERROR_FILE_OPEN:
        explanation = "Unable to load the specified file. Please check its path.";
        break;
    case IDRS_ERROR_NO_IMAGING_MODULE_READY:
        explanation = "No imaging module is ready. An imaging module is necessary to load/save image files.";
        break;
    case IDRS_ERROR_DRS_ASIAN_NOT_READY:
        explanation = "The Asian OCR add-on is not ready.";
        break;
    case IDRS_ERROR_DRS_HEBREW_NOT_READY:
        explanation = "The Hebrew OCR add-on is not ready.";
        break;
    case IDRS_ERROR_DRS_ICR_NOT_READY:
        explanation = "The ICR module is not ready.";
        break;
    case IDRS_ERROR_DRS_BANKING_FONTS_NOT_READY:
        explanation = "The banking fonts recognition add-on is not ready.";
        break;
    case IDRS_ERROR_DRS_ARABIC_NOT_READY:
        explanation = "The Arabic OCR add-on is not ready.";
        break;
    default:
        break;
    }

    if (explanation != NULL)
        out << explanation << endl;
}
|