Begin a C++ cli and remove the Python scripts
This commit is contained in:
		
							
								
								
									
										167
									
								
								addindex
									
									
									
									
									
								
							
							
						
						
									
										167
									
								
								addindex
									
									
									
									
									
								
							| @@ -1,167 +0,0 @@ | ||||
| #!/usr/bin/python3 | ||||
| import sqlite3 | ||||
| import os.path | ||||
| import sys | ||||
| import subprocess | ||||
| import zipfile | ||||
| import xml.etree.ElementTree | ||||
| import re | ||||
| import chardet | ||||
| import config | ||||
| from multiprocessing import Pool | ||||
|  | ||||
|  | ||||
|  | ||||
class pagedata:
    """Text extracted from a document, tagged with the page it came from."""

    # Page number. The producers in this file use 0 for entries that hold
    # the content of a whole document rather than a single page.
    page = 0
    # Extracted text.
    content = ""
| 	 | ||||
| 	 | ||||
def singlepagelist(content):
    """Wrap *content* in a one-element list holding a ``pagedata`` with page 0."""
    entry = pagedata()
    entry.page = 0
    entry.content = content
    return [entry]
|  | ||||
def striptags(content):
    """Return *content* with markup tags removed.

    The input is first parsed as XML and all text nodes are concatenated.
    If that fails (for example because the input is not well-formed XML),
    tags are removed with a regular expression instead.
    """
    try:
        return ''.join(xml.etree.ElementTree.fromstring(content).itertext())
    except Exception:
        # Deliberately broad (best effort), but unlike a bare ``except`` this
        # lets KeyboardInterrupt and SystemExit propagate.
        # TODO: test<br>test2 will make it testtest2 not test test2
        return re.sub(r'<[^>]*>', '', content)
| 	 | ||||
|  | ||||
def strip_irrelevant(content):
    """Turn newlines and tabs into spaces, then collapse runs of spaces into one."""
    flattened = content.replace("\n", " ")
    flattened = flattened.replace("\t", " ")
    return re.sub(' +', ' ', flattened)
|  | ||||
|  | ||||
|  | ||||
def process_pdf(path):
    """Extract the text of the PDF at *path* by running ``pdftotext``.

    Returns a list of ``pagedata``: one entry per form-feed separated chunk
    of the output (numbered from 1), followed by one extra entry with page 0
    that holds the text of the whole document.
    """
    args = ["pdftotext", path, "-"]
    stdout, stderr = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    # errors='replace': a stray undecodable byte must not raise, because an
    # exception here would abort the whole Pool.map run.
    content = strip_irrelevant(stdout.decode('utf-8', errors='replace'))

    result = []
    # It is faster to convert once and split on form feeds than to call
    # pdftotext for each page.
    for number, text in enumerate(content.split("\f"), start=1):
        data = pagedata()
        data.page = number
        data.content = text
        result.append(data)

    # TODO: current hack, so we can fts search several words over the whole document
    # this of course uses more space, but in the end that's not a big problem
    # Nevertheless, this remains a hack
    everything = pagedata()
    everything.page = 0
    everything.content = content.replace("\f", "")
    result.append(everything)
    return result
| 	 | ||||
def process_odt(path):
    """Return the tag-stripped text of ``content.xml`` inside the archive at *path*.

    The result is a single-entry page list (page 0).
    """
    # The context manager closes the archive even when reading or decoding fails.
    with zipfile.ZipFile(path) as archive:
        content = archive.read("content.xml").decode("utf-8")
    return singlepagelist(striptags(content))
| 	 | ||||
def process_ods(path):
    """Process an .ods file; handled exactly like .odt by ``process_odt``."""
    pages = process_odt(path)
    return pages
| 	 | ||||
def readtext(path):
    """Read the file at *path* and return its content as text.

    UTF-8 is tried first. If that fails, the encoding is guessed with
    ``chardet``. An empty string is returned when no encoding is detected or
    when decoding with the detected encoding fails.
    """
    # The context manager closes the file even when read() raises.
    with open(path, "rb") as fd:
        content = fd.read()

    try:
        return content.decode("utf-8")
    except UnicodeDecodeError:
        pass

    try:
        encoding = chardet.detect(content)["encoding"]
        if encoding is None:
            return ""
        return content.decode(encoding)
    except Exception:
        # Best effort: an undecodable file is indexed with empty content.
        print("FAILED DECODING: " + path)
        return ""
| 	 | ||||
def process_striptags(path):
    """Read the file at *path*, strip its markup tags and return it as a single page."""
    return singlepagelist(striptags(readtext(path)))
| 	 | ||||
def process_text(path):
    """Read the file at *path* as text and return it as a single page."""
    text = readtext(path)
    return singlepagelist(text)
| 	 | ||||
def process_nothing(path):
    """Extract no content: always returns an empty page list."""
    return []
| 	 | ||||
def exists(abspath, mtime):
    """Return True if the ``file`` table has a row for *abspath* with this *mtime*.

    Uses the module-level ``dbcon`` connection.
    """
    cursor = dbcon.cursor()
    cursor.execute("SELECT 1 FROM file WHERE path = ? AND mtime = ?", (abspath, mtime))
    row = cursor.fetchone()
    return row is not None and row[0] == 1
|  | ||||
def insert(path):
    """Index the file at *path*.

    The file is skipped when a row with the same absolute path and mtime
    already exists. Otherwise its content is extracted with the processor
    registered for its extension (``process_nothing`` when there is none),
    and the ``file`` row plus one ``content`` row per page are written
    inside a single transaction.
    """
    print("processing", path)
    abspath = os.path.abspath(path)
    mtime = int(os.stat(abspath).st_mtime)

    if exists(abspath, mtime):
        print("Leaving alone " + abspath + " because it wasn't changed")
        return

    ext = os.path.splitext(abspath)[1]
    processor = preprocess.get(ext, process_nothing)
    pagedatalist = processor(abspath)

    #TODO: assumes sqlite has been built with thread safety (and it is the default)
    cursor = dbcon.cursor()
    cursor.execute("BEGIN TRANSACTION")
    try:
        cursor.execute("DELETE FROM file WHERE path = ?", (abspath,))
        cursor.execute("INSERT INTO file(path, mtime) VALUES(?, ?) ", (abspath, mtime))
        fileid = cursor.lastrowid
        for entry in pagedatalist:
            cursor.execute("INSERT INTO content(fileid, page, content) VALUES(?, ?, ?)", (fileid, entry.page, entry.content))
        cursor.execute("COMMIT TRANSACTION")
    except Exception:
        # Without a rollback the connection would stay inside the open
        # transaction and the next BEGIN on this worker would fail.
        if dbcon.in_transaction:
            cursor.execute("ROLLBACK TRANSACTION")
        raise
|  | ||||
# Maps a file extension (including the leading dot) to the function that
# extracts the page list for files of that type.
preprocess = {
    ".pdf": process_pdf,
    ".odt": process_odt,
    ".ods": process_ods,
    ".html": process_striptags,
    ".xml": process_nothing,
    ".txt": process_text,
    ".sql": process_text,
    ".c": process_text,
    ".cpp": process_text,
    ".js": process_text,
    ".java": process_text,
    ".py": process_text,
    '.md': process_text,
}
|  | ||||
def init():
    """Open the SQLite connection and store it in the module-level ``dbcon``.

    Passed as the ``initializer`` of the worker pool in the script entry point.
    """
    global dbcon
    # isolation_level=None: transactions are managed explicitly via BEGIN/COMMIT.
    dbcon = sqlite3.connect(config.DBPATH, isolation_level=None)
|  | ||||
|  | ||||
# Database connection; set by init().
dbcon = None
if __name__ == '__main__':
    with Pool(processes=4, initializer=init) as pool:
        # Paths come from the command line, or one per line from stdin when
        # no arguments are given.
        if len(sys.argv) >= 2:
            paths = sys.argv[1:]
        else:
            paths = (line.replace("\n", "") for line in sys.stdin)
        pool.map(insert, paths)
|  | ||||
| 	 | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
							
								
								
									
										9
									
								
								cli/addfileexception.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								cli/addfileexception.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,9 @@ | ||||
| #ifndef ADDFILEEXCEPTION_H | ||||
| #define ADDFILEEXCEPTION_H | ||||
| #include <QException> | ||||
| #include <QString> | ||||
| class AddFileException : public QException | ||||
| { | ||||
|  | ||||
| }; | ||||
| #endif // ADDFILEEXCEPTION_H | ||||
							
								
								
									
										46
									
								
								cli/cli.pro
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										46
									
								
								cli/cli.pro
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,46 @@ | ||||
| QT -= gui | ||||
| QT += sql concurrent | ||||
| CONFIG += c++11 console | ||||
| CONFIG -= app_bundle | ||||
|  | ||||
| # The following define makes your compiler emit warnings if you use | ||||
| # any feature of Qt which has been marked deprecated (the exact warnings | ||||
| # depend on your compiler). Please consult the documentation of the | ||||
| # deprecated API in order to know how to port your code away from it. | ||||
| DEFINES += QT_DEPRECATED_WARNINGS | ||||
|  | ||||
| # You can also make your code fail to compile if you use deprecated APIs. | ||||
| # In order to do so, uncomment the following line. | ||||
| # You can also select to disable deprecated APIs only up to a certain version of Qt. | ||||
| #DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000    # disables all the APIs deprecated before Qt 6.0.0 | ||||
| LIBS += -luchardet -lpoppler-qt5 -lquazip | ||||
| SOURCES += \ | ||||
|         main.cpp \ | ||||
|     encodingdetector.cpp \ | ||||
|     processor.cpp \ | ||||
|     pdfprocessor.cpp \ | ||||
|     defaulttextprocessor.cpp \ | ||||
|     command.cpp \ | ||||
|     commandadd.cpp \ | ||||
|     tagstripperprocessor.cpp \ | ||||
|     nothingprocessor.cpp \ | ||||
|     odtprocessor.cpp \ | ||||
|     utils.cpp \ | ||||
|     odsprocessor.cpp \ | ||||
|     qssgeneralexception.cpp | ||||
|  | ||||
| HEADERS += \ | ||||
|     encodingdetector.h \ | ||||
|     processor.h \ | ||||
|     pagedata.h \ | ||||
|     pdfprocessor.h \ | ||||
|     defaulttextprocessor.h \ | ||||
|     command.h \ | ||||
|     commandadd.h \ | ||||
|     tagstripperprocessor.h \ | ||||
|     nothingprocessor.h \ | ||||
|     odtprocessor.h \ | ||||
|     utils.h \ | ||||
|     odsprocessor.h \ | ||||
|     qssgeneralexception.h | ||||
| INCLUDEPATH += /usr/include/poppler/qt5/ /usr/include/quazip5 | ||||
							
								
								
									
										37
									
								
								cli/command.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								cli/command.cpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,37 @@ | ||||
| #include <QFile> | ||||
| #include <QThread> | ||||
| #include <QDebug> | ||||
| #include "command.h" | ||||
| #include "qssgeneralexception.h" | ||||
|  | ||||
| bool Command::fileExistsInDatabase(QSqlDatabase &db, QString path, qint64 mtime) | ||||
| { | ||||
|     auto query = QSqlQuery("SELECT 1 FROM file WHERE path = ? and mtime = ?", db); | ||||
|     query.addBindValue(path); | ||||
|     query.addBindValue(mtime); | ||||
|     if(!query.exec()) | ||||
|     {        throw QSSGeneralException("Error while trying to query for file existance"); | ||||
|     } | ||||
|     if(!query.next()) | ||||
|     { | ||||
|         return false; | ||||
|     } | ||||
|     return query.value(0).toBool(); | ||||
| } | ||||
|  | ||||
| QSqlDatabase Command::dbConnection() | ||||
| { | ||||
|     if(dbStore.hasLocalData()) | ||||
|     { | ||||
|         return dbStore.localData(); | ||||
|     } | ||||
|     QSqlDatabase db = QSqlDatabase::addDatabase("QSQLITE", "QSS" + QString::number((quint64)QThread::currentThread(), 16)); | ||||
|     db.setDatabaseName(this->dbConnectionString); | ||||
|     if(!db.open()) | ||||
|     { | ||||
|         qDebug() << "Failed to open the database: " << this->dbConnectionString; | ||||
|     } | ||||
|     dbStore.setLocalData(db); | ||||
|     return db; | ||||
| } | ||||
|  | ||||
							
								
								
									
										26
									
								
								cli/command.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								cli/command.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | ||||
| #ifndef COMMAND_H | ||||
| #define COMMAND_H | ||||
| #include <QStringList> | ||||
| #include <QSqlDatabase> | ||||
| #include <QSqlQuery> | ||||
| #include <QThreadStorage> | ||||
| #include <QVariant> | ||||
| class Command | ||||
| { | ||||
| protected: | ||||
|     bool fileExistsInDatabase(QSqlDatabase &db, QString path, qint64 mtime); | ||||
|     QByteArray readFile(QString path) const; | ||||
|     QString dbConnectionString; | ||||
|     QThreadStorage<QSqlDatabase> dbStore; | ||||
| public: | ||||
|     Command(QString dbConnectionString) | ||||
|     { | ||||
|         this->dbConnectionString = dbConnectionString; | ||||
|     } | ||||
|  | ||||
|     QSqlDatabase dbConnection(); | ||||
|     virtual int handle(QStringList arguments) = 0; | ||||
|     virtual ~Command() { }; | ||||
| }; | ||||
|  | ||||
| #endif // COMMAND_H | ||||
							
								
								
									
										214
									
								
								cli/commandadd.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										214
									
								
								cli/commandadd.cpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,214 @@ | ||||
#include <atomic>
#include <QFileInfo>
#include <QDebug>
#include <QSqlQuery>
#include <QSqlError>
#include <QDateTime>
#include <QMap>
#include <QTextStream>
#include <QException>
#include <QCommandLineParser>
#include <QMutex>
#include <QMutexLocker>
#include <QtConcurrent/QtConcurrentMap>
#include "processor.h"
#include "pdfprocessor.h"
#include "commandadd.h"
#include "defaulttextprocessor.h"
#include "tagstripperprocessor.h"
#include "nothingprocessor.h"
#include "odtprocessor.h"
#include "odsprocessor.h"
#include "utils.h"
| static DefaultTextProcessor *defaultTextProcessor = new DefaultTextProcessor(); | ||||
| static TagStripperProcessor *tagStripperProcessor = new TagStripperProcessor(); | ||||
| static NothingProcessor *nothingProcessor = new NothingProcessor(); | ||||
| static OdtProcessor *odtProcessor = new OdtProcessor(); | ||||
| static OdsProcessor *odsProcessor = new OdsProcessor(); | ||||
|  | ||||
| static QMap<QString, Processor*> processors{ | ||||
|     {"pdf", new PdfProcessor() }, | ||||
|     {"txt", defaultTextProcessor}, | ||||
|     {"md", defaultTextProcessor}, | ||||
|     {"py", defaultTextProcessor}, | ||||
|     {"xml", nothingProcessor}, | ||||
|     {"html", tagStripperProcessor}, | ||||
|     {"java", defaultTextProcessor}, | ||||
|     {"js", defaultTextProcessor}, | ||||
|     {"cpp", defaultTextProcessor}, | ||||
|     {"c", defaultTextProcessor}, | ||||
|     {"sql", defaultTextProcessor}, | ||||
|     {"odt", odtProcessor}, | ||||
|     {"ods", odsProcessor} | ||||
| }; | ||||
|  | ||||
|  | ||||
|  | ||||
| AddFileResult CommandAdd::addFile(QString path) | ||||
| { | ||||
|     QFileInfo info(path); | ||||
|     QString absPath = info.absoluteFilePath(); | ||||
|     auto mtime = info.lastModified().toSecsSinceEpoch(); | ||||
|     QChar fileType = info.isDir() ? 'd' : 'f'; | ||||
|  | ||||
|     QSqlDatabase db = dbConnection(); | ||||
|     if(fileExistsInDatabase(db, absPath, mtime)) | ||||
|     { | ||||
|         return SKIPPED; | ||||
|     } | ||||
|  | ||||
|  | ||||
|  | ||||
|     Processor *processor = processors.value(info.suffix(), nothingProcessor); | ||||
|     QVector<PageData> pageData; | ||||
|     if(processor->PREFERED_DATA_SOURCE == FILEPATH) | ||||
|     { | ||||
|         pageData = processor->process(absPath); | ||||
|     } | ||||
|     else | ||||
|     { | ||||
|         pageData = processor->process(Utils::readFile(absPath)); | ||||
|     } | ||||
|  | ||||
|     if(pageData.isEmpty()) | ||||
|     { | ||||
|         qDebug() << "Could not get any content for " << absPath; | ||||
|     } | ||||
|  | ||||
|  | ||||
|     //Workaround to "database is locked" error. Perhaps try WAL mode etc. | ||||
|     //QMutexLocker locker(&writeMutex); | ||||
|     if(!db.transaction()) | ||||
|     { | ||||
|         qDebug() << "Failed to open transaction for " << absPath << " : " << db.lastError(); | ||||
|         return DBFAIL; | ||||
|     } | ||||
|  | ||||
|     QSqlQuery delQuery("DELETE FROM file WHERE path = ?", db); | ||||
|     delQuery.addBindValue(absPath); | ||||
|     if(!delQuery.exec()) | ||||
|     { | ||||
|         qDebug() << "Failed DELETE query" <<  delQuery.lastError(); | ||||
|         db.rollback(); | ||||
|         return DBFAIL; | ||||
|     } | ||||
|  | ||||
|  | ||||
|     QSqlQuery inserterQuery("INSERT INTO file(path, mtime, size, filetype) VALUES(?, ?, ?, ?)", db); | ||||
|     inserterQuery.addBindValue(absPath); | ||||
|     inserterQuery.addBindValue(mtime); | ||||
|     inserterQuery.addBindValue(info.size()); | ||||
|     inserterQuery.addBindValue(fileType); | ||||
|     if(!inserterQuery.exec()) | ||||
|     { | ||||
|         qDebug() << "Failed INSERT query" <<  inserterQuery.lastError(); | ||||
|         db.rollback(); | ||||
|         return DBFAIL; | ||||
|     } | ||||
|     int lastid = inserterQuery.lastInsertId().toInt(); | ||||
|     for(PageData &data : pageData) | ||||
|     { | ||||
|         QSqlQuery contentQuery("INSERT INTO content(fileid, page, content) VALUES(?, ?, ?)", db); | ||||
|         contentQuery.addBindValue(lastid); | ||||
|         contentQuery.addBindValue(data.pagenumber); | ||||
|         contentQuery.addBindValue(data.content); | ||||
|         if(!contentQuery.exec()) | ||||
|         { | ||||
|             db.rollback(); | ||||
|             qDebug() << "Failed content insertion " << contentQuery.lastError(); | ||||
|             return DBFAIL; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     if(!db.commit()) | ||||
|     { | ||||
|         db.rollback(); | ||||
|         qDebug() << "Failed to commit transaction for " << absPath <<  " : " << db.lastError(); | ||||
|         return DBFAIL; | ||||
|     } | ||||
|     return OK; | ||||
| } | ||||
|  | ||||
|  | ||||
| int CommandAdd::handle(QStringList arguments) | ||||
| { | ||||
|     QCommandLineParser parser; | ||||
|     parser.addOptions({ | ||||
|                           { { "c", "continue" }, "Continue adding files, don't exit on first error"}, | ||||
|                           { { "a", "all" }, "On error, no files should be added, even already processed ones" }, | ||||
|                           { { "v", "verbose" }, "Print skipped and added files" }, | ||||
|                           { { "n", "threads" }, "Number of threads to use.", "threads" } | ||||
|                       }); | ||||
|  | ||||
|     parser.addHelpOption(); | ||||
|     parser.addPositionalArgument("paths", "List of paths to process/add to the index", "[paths...]"); | ||||
|  | ||||
|     parser.process(arguments); | ||||
|     bool keepGoing = false; | ||||
|     bool verbose = false; | ||||
|     if(parser.isSet("continue")) | ||||
|     { | ||||
|         keepGoing = true; | ||||
|     } | ||||
|     if(parser.isSet("verbose")) | ||||
|     { | ||||
|         verbose = true; | ||||
|     } | ||||
|     if(parser.isSet("all")) | ||||
|     { | ||||
|         throw QSSGeneralException("To be implemented"); | ||||
|     } | ||||
|     if(parser.isSet("threads")) | ||||
|     { | ||||
|         QString threadsCount = parser.value("threads"); | ||||
|         QThreadPool::globalInstance()->setMaxThreadCount(threadsCount.toInt()); | ||||
|     } | ||||
|  | ||||
|     QStringList files = parser.positionalArguments(); | ||||
|  | ||||
|     if(files.length() == 0) | ||||
|     { | ||||
|         QTextStream stream(stdin); | ||||
|  | ||||
|         while(!stream.atEnd()) | ||||
|         { | ||||
|             QString path = stream.readLine(); | ||||
|             files.append(path); | ||||
|          } | ||||
|     } | ||||
|  | ||||
|     bool terminate = false; | ||||
|     QtConcurrent::blockingMap(files, [&](QString &path) { | ||||
|         if(terminate) | ||||
|         { | ||||
|             return; | ||||
|         } | ||||
|         if(verbose) | ||||
|         { | ||||
|             qDebug() << "Processing " << path; | ||||
|         } | ||||
|         auto result = addFile(path); | ||||
|         if(result == DBFAIL) | ||||
|         { | ||||
|             qDebug() << "Failed to add " << path; | ||||
|             if(!keepGoing) | ||||
|             { | ||||
|                 terminate = true; | ||||
|             } | ||||
|         } | ||||
|         if(verbose) | ||||
|         { | ||||
|             if(result == SKIPPED) | ||||
|             { | ||||
|                 qDebug() << "SKIPPED" << path << "as it already exists in the database"; | ||||
|             } | ||||
|             else | ||||
|             { | ||||
|                 qDebug() << "Added" << path; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|  | ||||
|     }); | ||||
|  | ||||
|     return 0; | ||||
| } | ||||
							
								
								
									
										23
									
								
								cli/commandadd.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										23
									
								
								cli/commandadd.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,23 @@ | ||||
| #ifndef COMMANDADD_H | ||||
| #define COMMANDADD_H | ||||
| #include <QMutex> | ||||
| #include "command.h" | ||||
// Outcome of CommandAdd::addFile for a single path.
enum AddFileResult
{
    OK,      // file and content rows were committed
    SKIPPED, // a row with the same path and mtime already exists
    DBFAIL   // a database step failed
};
|  | ||||
| class CommandAdd : public Command | ||||
| { | ||||
| private: | ||||
|     AddFileResult addFile(QString path); | ||||
|     QMutex writeMutex; | ||||
| public: | ||||
|     using Command::Command; | ||||
|  | ||||
|     int handle(QStringList arguments) override; | ||||
| }; | ||||
|  | ||||
| #endif // COMMANDADD_H | ||||
							
								
								
									
										31
									
								
								cli/defaulttextprocessor.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										31
									
								
								cli/defaulttextprocessor.cpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,31 @@ | ||||
| #include <QFile> | ||||
| #include <QDataStream> | ||||
| #include <QTextCodec> | ||||
| #include <QDebug> | ||||
| #include "defaulttextprocessor.h" | ||||
|  | ||||
| DefaultTextProcessor::DefaultTextProcessor() | ||||
| { | ||||
|  | ||||
| } | ||||
|  | ||||
| QString DefaultTextProcessor::processText(const QByteArray &data) const | ||||
| { | ||||
|     QString encoding = encodingDetector.detectEncoding(data); | ||||
|     if(!encoding.isEmpty()) | ||||
|     { | ||||
|         QTextCodec *codec = QTextCodec::codecForName(encoding.toUtf8()); | ||||
|         if(codec != nullptr) | ||||
|         { | ||||
|             return codec->toUnicode(data); | ||||
|         } | ||||
|         qWarning() << "No codec found for " << encoding; | ||||
|         return QString(data); | ||||
|     } | ||||
|     return { }; | ||||
| } | ||||
|  | ||||
| QVector<PageData> DefaultTextProcessor::process(const QByteArray &data) const | ||||
| { | ||||
|     return {{0, processText(data)}}; | ||||
| } | ||||
							
								
								
									
										16
									
								
								cli/defaulttextprocessor.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								cli/defaulttextprocessor.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,16 @@ | ||||
| #ifndef DEFAULTTEXTPROCESSOR_H | ||||
| #define DEFAULTTEXTPROCESSOR_H | ||||
|  | ||||
| #include "processor.h" | ||||
| #include "encodingdetector.h" | ||||
| class DefaultTextProcessor : public Processor | ||||
| { | ||||
| protected: | ||||
|     EncodingDetector encodingDetector; | ||||
| public: | ||||
|     DefaultTextProcessor(); | ||||
|     QString processText(const QByteArray &data) const; | ||||
|     QVector<PageData> process(const QByteArray &data) const override; | ||||
| }; | ||||
|  | ||||
| #endif // DEFAULTTEXTPROCESSOR_H | ||||
							
								
								
									
										50
									
								
								cli/encodingdetector.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										50
									
								
								cli/encodingdetector.cpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,50 @@ | ||||
| #include <QDataStream> | ||||
| #include "encodingdetector.h" | ||||
| #include <qssgeneralexception.h> | ||||
| EncodingDetector::EncodingDetector() | ||||
| { | ||||
|  | ||||
| } | ||||
|  | ||||
|  | ||||
| QString EncodingDetector::detectEncoding(const QByteArray &data) const | ||||
| { | ||||
|     uchardet_t detector = uchardet_new(); | ||||
|     if(uchardet_handle_data(detector, data.data(), data.size()) != 0 ) | ||||
|     { | ||||
|       uchardet_delete(detector); | ||||
|       throw QSSGeneralException("Decoder failed"); | ||||
|     } | ||||
|     uchardet_data_end(detector); | ||||
|     QString encoding = uchardet_get_charset(detector); | ||||
|     uchardet_delete(detector); | ||||
|     return encoding; | ||||
|  | ||||
| } | ||||
| QString EncodingDetector::detectEncoding(QDataStream &s) const | ||||
| { | ||||
|     uchardet_t detector = uchardet_new(); | ||||
|  | ||||
|     char buffer[4096]; | ||||
|     int n; | ||||
|     while((n = s.readRawData(buffer, sizeof(buffer))) > 0) | ||||
|     { | ||||
|         if(uchardet_handle_data(detector, buffer, n) != 0) | ||||
|         { | ||||
|             uchardet_delete(detector); | ||||
|  | ||||
|             throw QSSGeneralException("Decoder failed"); | ||||
|         } | ||||
|     } | ||||
|     if(n == -1) | ||||
|     { | ||||
|         uchardet_delete(detector); | ||||
|         throw QSSGeneralException("Read failed"); | ||||
|     } | ||||
|     uchardet_data_end(detector); | ||||
|     QString encoding = uchardet_get_charset(detector); | ||||
|     uchardet_delete(detector); | ||||
|     return encoding; | ||||
| } | ||||
|  | ||||
|  | ||||
							
								
								
									
										14
									
								
								cli/encodingdetector.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								cli/encodingdetector.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,14 @@ | ||||
| #ifndef ENCODINGDETECTOR_H | ||||
| #define ENCODINGDETECTOR_H | ||||
| #include <QString> | ||||
| #include <uchardet/uchardet.h> | ||||
| class EncodingDetector | ||||
| { | ||||
|  | ||||
| public: | ||||
|     EncodingDetector(); | ||||
|     QString detectEncoding(const QByteArray &data) const; | ||||
|     QString detectEncoding(QDataStream &s) const; | ||||
| }; | ||||
|  | ||||
| #endif // ENCODINGDETECTOR_H | ||||
							
								
								
									
										76
									
								
								cli/main.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										76
									
								
								cli/main.cpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,76 @@ | ||||
| #include <QCoreApplication> | ||||
| #include <QFile> | ||||
| #include <QFileInfo> | ||||
| #include <QDateTime> | ||||
| #include <QDataStream> | ||||
| #include <QDebug> | ||||
| #include <QProcessEnvironment> | ||||
| #include <QSqlDatabase> | ||||
| #include <QSqlQuery> | ||||
| #include <QSqlError> | ||||
| #include <QMap> | ||||
| #include <QDebug> | ||||
| #include <functional> | ||||
| #include <exception> | ||||
| #include "encodingdetector.h" | ||||
| #include "pdfprocessor.h" | ||||
| #include "defaulttextprocessor.h" | ||||
| #include "command.h" | ||||
| #include "commandadd.h" | ||||
| void printUsage(QString argv0) | ||||
| { | ||||
|     qInfo() << "Usage: " << argv0 << "command"; | ||||
| } | ||||
|  | ||||
| Command *commandFromName(QString name, QString connectionstring) | ||||
| { | ||||
|     if(name == "add") | ||||
|     { | ||||
|         return new CommandAdd(connectionstring); | ||||
|     } | ||||
|     if(name == "delete") | ||||
|     { | ||||
|  | ||||
|     } | ||||
|     if(name == "update") | ||||
|     { | ||||
|  | ||||
|     } | ||||
|     if(name == "search") | ||||
|     { | ||||
|  | ||||
|     } | ||||
|     return nullptr; | ||||
| } | ||||
|  | ||||
| int main(int argc, char *argv[]) | ||||
| { | ||||
|     QCoreApplication app(argc, argv); | ||||
|  | ||||
|     QStringList args = app.arguments(); | ||||
|     QString argv0 = args.takeFirst(); | ||||
|     if(args.length() < 1) | ||||
|     { | ||||
|         printUsage(argv0); | ||||
|         exit(1); | ||||
|     } | ||||
|  | ||||
|     QString commandName = args.first(); | ||||
|     Command *cmd = commandFromName(commandName, QProcessEnvironment::systemEnvironment().value("QSS_PATH")); | ||||
|     if(cmd != nullptr) | ||||
|     { | ||||
|         try | ||||
|         { | ||||
|             return cmd->handle(args); | ||||
|         } | ||||
|         catch(const QSSGeneralException &e) | ||||
|         { | ||||
|             qDebug() << "Exception caught, message: " << e.message; | ||||
|         } | ||||
|     } | ||||
|     else | ||||
|     { | ||||
|         qDebug() << "Unknown command " << commandName; | ||||
|     } | ||||
|     return 1; | ||||
| } | ||||
							
								
								
									
										6
									
								
								cli/nothingprocessor.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								cli/nothingprocessor.cpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,6 @@ | ||||
| #include "nothingprocessor.h" | ||||
|  | ||||
| NothingProcessor::NothingProcessor() | ||||
| { | ||||
|  | ||||
| } | ||||
							
								
								
									
										19
									
								
								cli/nothingprocessor.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								cli/nothingprocessor.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,19 @@ | ||||
| #ifndef NOTHINGPROCESSOR_H | ||||
| #define NOTHINGPROCESSOR_H | ||||
| #include <QVector> | ||||
| #include "processor.h" | ||||
| #include "pagedata.h" | ||||
|  | ||||
| class NothingProcessor : public Processor | ||||
| { | ||||
| public: | ||||
|     NothingProcessor(); | ||||
|  | ||||
| public: | ||||
|     QVector<PageData> process(const QByteArray &data) const override { | ||||
|         return { }; | ||||
|     } | ||||
| }; | ||||
|  | ||||
|  | ||||
| #endif // NOTHINGPROCESSOR_H | ||||
							
								
								
									
										6
									
								
								cli/odsprocessor.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								cli/odsprocessor.cpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,6 @@ | ||||
| #include "odsprocessor.h" | ||||
|  | ||||
| OdsProcessor::OdsProcessor() | ||||
| { | ||||
|  | ||||
| } | ||||
							
								
								
									
										10
									
								
								cli/odsprocessor.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								cli/odsprocessor.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,10 @@ | ||||
| #ifndef ODSPROCESSOR_H | ||||
| #define ODSPROCESSOR_H | ||||
| #include "odtprocessor.h" | ||||
| class OdsProcessor : public OdtProcessor | ||||
| { | ||||
| public: | ||||
|     OdsProcessor(); | ||||
| }; | ||||
|  | ||||
| #endif // ODSPROCESSOR_H | ||||
							
								
								
									
										27
									
								
								cli/odtprocessor.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								cli/odtprocessor.cpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,27 @@ | ||||
| #include <quazip5/quazip.h> | ||||
| #include <quazip5/quazipfile.h> | ||||
| #include "odtprocessor.h" | ||||
| #include "tagstripperprocessor.h" | ||||
|  | ||||
|  | ||||
| /* Processing from an in-memory byte array is not implemented: this overload always throws. */ | ||||
| QVector<PageData> OdtProcessor::process(const QByteArray &data) const | ||||
| { | ||||
|     Q_UNUSED(data); | ||||
|     throw QSSGeneralException("Not implemented yet"); | ||||
| } | ||||
|  | ||||
| /* Reads content.xml from the zip archive at `path` and returns its text with the tags stripped. | ||||
|  * Throws QSSGeneralException if the entry cannot be opened or yields no data. */ | ||||
| QVector<PageData> OdtProcessor::process(QString path) const | ||||
| { | ||||
|     // Only the content.xml entry of the archive is opened. | ||||
|     QuaZipFile contentEntry(path); | ||||
|     contentEntry.setFileName("content.xml"); | ||||
|     const bool opened = contentEntry.open(QIODevice::ReadOnly); | ||||
|     if(!opened) | ||||
|     { | ||||
|        throw QSSGeneralException("Error while opening file " + path); | ||||
|     } | ||||
|  | ||||
|     const QByteArray xml = contentEntry.readAll(); | ||||
|     if(xml.isEmpty()) | ||||
|     { | ||||
|         throw QSSGeneralException("Error while reading content.xml of " + path); | ||||
|     } | ||||
|  | ||||
|     // Turning the XML into plain text is delegated to the tag stripper. | ||||
|     return TagStripperProcessor().process(xml); | ||||
| } | ||||
							
								
								
									
										18
									
								
								cli/odtprocessor.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								cli/odtprocessor.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,18 @@ | ||||
| #ifndef ODTPROCESSOR_H | ||||
| #define ODTPROCESSOR_H | ||||
| #include "processor.h" | ||||
| /* Processor that declares FILEPATH as its preferred data source. */ | ||||
| class OdtProcessor : public Processor | ||||
| { | ||||
| public: | ||||
|     // Override the base class default (ARRAY) with FILEPATH. | ||||
|     OdtProcessor() | ||||
|     { | ||||
|         PREFERED_DATA_SOURCE = FILEPATH; | ||||
|     } | ||||
|  | ||||
|     // Byte-array overload. | ||||
|     QVector<PageData> process(const QByteArray &data) const override; | ||||
|  | ||||
|     // Path overload. | ||||
|     QVector<PageData> process(QString path) const override; | ||||
| }; | ||||
|  | ||||
| #endif // ODTPROCESSOR_H | ||||
							
								
								
									
										21
									
								
								cli/pagedata.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								cli/pagedata.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,21 @@ | ||||
| #ifndef PAGEDATA_H | ||||
| #define PAGEDATA_H | ||||
| #include <QString> | ||||
| /* Plain value type pairing a page number with the text belonging to it. */ | ||||
| class PageData | ||||
| { | ||||
| public: | ||||
|     // Defaults to 0 when no page number is given. | ||||
|     unsigned int pagenumber = 0; | ||||
|     QString content; | ||||
|  | ||||
|     // Leaves pagenumber at 0 and content empty. | ||||
|     PageData() { } | ||||
|  | ||||
|     PageData(unsigned int pagenumber, QString content) | ||||
|         : pagenumber(pagenumber), content(content) | ||||
|     { | ||||
|     } | ||||
| }; | ||||
| #endif // PAGEDATA_H | ||||
							
								
								
									
										34
									
								
								cli/pdfprocessor.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										34
									
								
								cli/pdfprocessor.cpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,34 @@ | ||||
| #include <QScopedPointer> | ||||
| #include <poppler-qt5.h> | ||||
| #include "pdfprocessor.h" | ||||
| // No state of its own to initialise. | ||||
| PdfProcessor::PdfProcessor() = default; | ||||
|  | ||||
|  | ||||
| /* Extracts the text of every page of the PDF held in `data`. | ||||
|  * | ||||
|  * Returns one PageData per readable page (numbered from 1), followed by one | ||||
|  * entry with page number 0 that holds the concatenated text of all pages. | ||||
|  * Returns an empty vector if the document cannot be loaded or is locked. */ | ||||
| QVector<PageData> PdfProcessor::process(const QByteArray &data) const | ||||
| { | ||||
|     QScopedPointer<Poppler::Document> doc(Poppler::Document::loadFromData(data)); | ||||
|     if(doc.isNull() || doc->isLocked()) | ||||
|     { | ||||
|         return { }; | ||||
|     } | ||||
|  | ||||
|     QVector<PageData> result; | ||||
|     const int pagecount = doc->numPages(); | ||||
|     result.reserve(pagecount + 1); | ||||
|  | ||||
|     // A null rectangle asks Poppler for the text of the whole page. | ||||
|     const QRectF entirePage; | ||||
|     QString entire; | ||||
|     entire.reserve(data.size()); //TODO too much | ||||
|     for(int i = 0; i < pagecount; i++) | ||||
|     { | ||||
|         // Document::page() hands ownership to the caller, so keep the page in a | ||||
|         // scoped pointer to free it; skip the page if it could not be loaded. | ||||
|         QScopedPointer<Poppler::Page> page(doc->page(i)); | ||||
|         if(page.isNull()) | ||||
|         { | ||||
|             continue; | ||||
|         } | ||||
|         const QString text = page->text(entirePage); | ||||
|         result.append(PageData(static_cast<unsigned int>(i + 1), text)); | ||||
|         /*TODO: hack, so we can fts search several words over the whole document, not just pages. | ||||
|          * this of course uses more space and should be solved differently. | ||||
|         */ | ||||
|         entire += text; | ||||
|     } | ||||
|     result.append(PageData(0, entire)); | ||||
|     return result; | ||||
| } | ||||
							
								
								
									
										13
									
								
								cli/pdfprocessor.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										13
									
								
								cli/pdfprocessor.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,13 @@ | ||||
| #ifndef PDFPROCESSOR_H | ||||
| #define PDFPROCESSOR_H | ||||
| #include "processor.h" | ||||
| /* Processor implementation for PDF data; only the byte-array overload is overridden. */ | ||||
| class PdfProcessor : public Processor | ||||
| { | ||||
| public: | ||||
|     PdfProcessor(); | ||||
|  | ||||
| public: | ||||
|     // Returns the extracted pages of the PDF document contained in `data`. | ||||
|     QVector<PageData> process(const QByteArray &data) const override; | ||||
| }; | ||||
|  | ||||
| #endif // PDFPROCESSOR_H | ||||
							
								
								
									
										7
									
								
								cli/processor.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										7
									
								
								cli/processor.cpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,7 @@ | ||||
| #include "processor.h" | ||||
|  | ||||
| // Members carry their own default initialisers, so there is nothing to do here. | ||||
| Processor::Processor() = default; | ||||
|  | ||||
							
								
								
									
										32
									
								
								cli/processor.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										32
									
								
								cli/processor.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,32 @@ | ||||
| #ifndef PROCESSOR_H | ||||
| #define PROCESSOR_H | ||||
| #include <QVector> | ||||
| #include <QFile> | ||||
| #include "pagedata.h" | ||||
| #include "utils.h" | ||||
| /* Kinds of input a Processor can be handed: a file path or an in-memory byte array. */ | ||||
| enum DataSource | ||||
| { | ||||
|     FILEPATH, | ||||
|     ARRAY | ||||
| }; | ||||
|  | ||||
| /* Abstract base class for everything that turns file content into PageData entries. */ | ||||
| class Processor | ||||
| { | ||||
| public: | ||||
|     /* Indicates the data source the processor performs best with. For example, | ||||
|      * you do not want to read an entire compressed archive just to get the content of | ||||
|      * a single file. */ | ||||
|     DataSource PREFERED_DATA_SOURCE = ARRAY; | ||||
|  | ||||
|     Processor(); | ||||
|  | ||||
|     /* Processes raw file content; every concrete processor has to implement this. */ | ||||
|     virtual QVector<PageData> process(const QByteArray &data) const = 0; | ||||
|  | ||||
|     /* Path-based variant. Unless overridden, it reads the whole file into memory | ||||
|      * and forwards the bytes to the byte-array overload. */ | ||||
|     virtual QVector<PageData> process(QString path) const | ||||
|     { | ||||
|         const QByteArray contents = Utils::readFile(path); | ||||
|         return process(contents); | ||||
|     } | ||||
|  | ||||
|     virtual ~Processor() { } | ||||
| }; | ||||
|  | ||||
| #endif // PROCESSOR_H | ||||
							
								
								
									
										2
									
								
								cli/qssgeneralexception.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								cli/qssgeneralexception.cpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | ||||
| #include "qssgeneralexception.h" | ||||
|  | ||||
							
								
								
									
										15
									
								
								cli/qssgeneralexception.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								cli/qssgeneralexception.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,15 @@ | ||||
| #ifndef QSSGENERALEXCEPTION_H | ||||
| #define QSSGENERALEXCEPTION_H | ||||
|  | ||||
| #include <QException> | ||||
|  | ||||
| /* General-purpose exception that carries a human-readable message. */ | ||||
| class QSSGeneralException : public QException | ||||
| { | ||||
| public: | ||||
|     QString message; | ||||
|  | ||||
|     QSSGeneralException(QString message) | ||||
|         : message(message) | ||||
|     { | ||||
|     } | ||||
|  | ||||
|     // QException interface: rethrow a copy of this exception. | ||||
|     void raise() const override | ||||
|     { | ||||
|         throw *this; | ||||
|     } | ||||
|  | ||||
|     // QException interface: heap-allocated copy, owned by the caller. | ||||
|     QSSGeneralException *clone() const override | ||||
|     { | ||||
|         return new QSSGeneralException(*this); | ||||
|     } | ||||
| }; | ||||
|  | ||||
| #endif // QSSGENERALEXCEPTION_H | ||||
							
								
								
									
										15
									
								
								cli/tagstripperprocessor.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								cli/tagstripperprocessor.cpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,15 @@ | ||||
| #include "tagstripperprocessor.h" | ||||
|  | ||||
| // No state of its own to initialise. | ||||
| TagStripperProcessor::TagStripperProcessor() = default; | ||||
|  | ||||
| /* Runs the default text processing on `data`, then removes everything that | ||||
|  * looks like a markup tag ("<...>") from each resulting entry. */ | ||||
| QVector<PageData> TagStripperProcessor::process(const QByteArray &data) const | ||||
| { | ||||
|     auto result = DefaultTextProcessor::process(data); | ||||
|     //TODO: does not work properly with <br> and does not deal with entities... | ||||
|  | ||||
|     // Strip every entry rather than indexing result[0], which would be out of | ||||
|     // bounds if the base processor returned an empty vector. | ||||
|     const QRegExp tagPattern("<[^>]*>"); | ||||
|     for(PageData &page : result) | ||||
|     { | ||||
|         page.content.remove(tagPattern); | ||||
|     } | ||||
|     return result; | ||||
| } | ||||
							
								
								
									
										14
									
								
								cli/tagstripperprocessor.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								cli/tagstripperprocessor.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,14 @@ | ||||
| #ifndef XMLSTRIPPERPROCESSOR_H | ||||
| #define XMLSTRIPPERPROCESSOR_H | ||||
| #include "defaulttextprocessor.h" | ||||
|  | ||||
| /* Text processor that builds on DefaultTextProcessor and overrides the byte-array overload. */ | ||||
| class TagStripperProcessor : public DefaultTextProcessor | ||||
| { | ||||
| public: | ||||
|     TagStripperProcessor(); | ||||
|  | ||||
| public: | ||||
|     // Returns the processed content of `data`. | ||||
|     QVector<PageData> process(const QByteArray &data) const override; | ||||
| }; | ||||
|  | ||||
| #endif // XMLSTRIPPERPROCESSOR_H | ||||
							
								
								
									
										21
									
								
								cli/utils.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								cli/utils.cpp
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,21 @@ | ||||
| #include <QDebug> | ||||
| #include "utils.h" | ||||
| // Nothing to initialise. | ||||
| Utils::Utils() = default; | ||||
|  | ||||
| /* Reads the whole file at `path` into memory. | ||||
|  * | ||||
|  * Throws QSSGeneralException if the file cannot be opened or if reading fails. | ||||
|  * An empty file is not an error and yields an empty array. */ | ||||
| QByteArray Utils::readFile(QString path) | ||||
| { | ||||
|     QFile file(path); | ||||
|     if(!file.open(QIODevice::ReadOnly)) | ||||
|     { | ||||
|         throw QSSGeneralException("Failed to open file: " + path); | ||||
|     } | ||||
|     QByteArray data = file.readAll(); | ||||
|     // readAll() cannot report errors itself, so an empty result is only | ||||
|     // treated as a failure when the device also reports an error. | ||||
|     if(data.isEmpty() && file.error() != QFileDevice::FileError::NoError) | ||||
|     { | ||||
|         // errorString() gives readable text; appending the FileError enum | ||||
|         // itself would only add a raw character code to the message. | ||||
|         throw QSSGeneralException("Error reading file: " + path + ", Error: " + file.errorString()); | ||||
|     } | ||||
|     return data; | ||||
| } | ||||
							
								
								
									
										15
									
								
								cli/utils.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								cli/utils.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,15 @@ | ||||
| #ifndef UTILS_H | ||||
| #define UTILS_H | ||||
| #include <QFile> | ||||
| #include <QString> | ||||
| #include <QByteArray> | ||||
| #include "qssgeneralexception.h" | ||||
|  | ||||
| /* Collection of static helper functions. */ | ||||
| class Utils | ||||
| { | ||||
| public: | ||||
|     Utils(); | ||||
|     // Returns the complete content of the file at `path`. | ||||
|     static QByteArray readFile(QString path); | ||||
| }; | ||||
|  | ||||
| #endif // UTILS_H | ||||
| @@ -1,5 +0,0 @@ | ||||
| import os | ||||
| DBPATH=os.getenv("QSS_PATH") | ||||
| if DBPATH == None or DBPATH == "": | ||||
|     print("MIssing env var") | ||||
|     exit(1) | ||||
							
								
								
									
										21
									
								
								delindex
									
									
									
									
									
								
							
							
						
						
									
										21
									
								
								delindex
									
									
									
									
									
								
							| @@ -1,21 +0,0 @@ | ||||
| #!/bin/sh | ||||
| # Removes index entries whose files no longer exist on disk. | ||||
| # The database location is read from $QSS_PATH. | ||||
| DBFILE="$QSS_PATH" | ||||
| if [ -z "$DBFILE" ]; then | ||||
| 	echo "Missing env var: QSS_PATH" >&2 | ||||
| 	exit 1 | ||||
| fi | ||||
|  | ||||
| TEMPFILE=$(mktemp) || exit 1 | ||||
| # Always remove the generated SQL script, even on failure. | ||||
| trap 'rm -f "$TEMPFILE"' EXIT | ||||
|  | ||||
| # Appends a DELETE statement for the path given as $1 to the SQL script. | ||||
| # Defined without the "function" keyword, which plain sh does not know. | ||||
| todelete() | ||||
| { | ||||
| 	# Double every single quote so the path is a valid SQL string literal. | ||||
| 	escaped=$(printf '%s' "$1" | sed "s/'/''/g") | ||||
| 	printf '%s\n' "DELETE FROM file WHERE path = '$escaped';" >> "$TEMPFILE" | ||||
| } | ||||
|  | ||||
| echo "BEGIN TRANSACTION;" >> "$TEMPFILE" | ||||
|  | ||||
| # IFS= and -r keep leading/trailing whitespace and backslashes in paths intact. | ||||
| sqlite3 "$DBFILE" "SELECT path FROM file;" | while IFS= read -r line ; do | ||||
| 	[ -e "$line" ] || todelete "$line" | ||||
| done | ||||
|  | ||||
| echo "COMMIT TRANSACTION;" >> "$TEMPFILE" | ||||
|  | ||||
| sqlite3 "$DBFILE" < "$TEMPFILE" | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
							
								
								
									
										21
									
								
								searchindex
									
									
									
									
									
								
							
							
						
						
									
										21
									
								
								searchindex
									
									
									
									
									
								
							| @@ -1,21 +0,0 @@ | ||||
| #!/usr/bin/python3 | ||||
| import sqlite3 | ||||
| import sys | ||||
| import config | ||||
|  | ||||
| dbcon = sqlite3.connect(config.DBPATH, isolation_level=None) | ||||
| cursor = dbcon.cursor() | ||||
|  | ||||
| if len(sys.argv) < 2: | ||||
| 	print("Error: Missing search") | ||||
| 	 | ||||
| search=sys.argv[1:] | ||||
| #TODO: machien parseable | ||||
| for row in cursor.execute("SELECT file.path, content.page FROM file INNER JOIN content ON file.id = content.fileid INNER JOIN content_fts ON content.id = content_fts.ROWID WHERE content_fts.content MATCH ? ORDER By file.mtime ASC", (search)): | ||||
| 	print("File:", row[0], "Page: ", row[1]) | ||||
| dbcon.close() | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
		Reference in New Issue
	
	Block a user