From 3e23021621004c5e75818ce37017746197230120 Mon Sep 17 00:00:00 2001 From: Albert S Date: Sat, 6 Apr 2019 17:16:42 +0200 Subject: [PATCH] Begin a C++ cli and remove the Python scripts --- addindex | 167 ----------------------------- cli/addfileexception.h | 8 ++ cli/cli.pro | 46 ++++++++ cli/command.cpp | 38 +++++++ cli/command.h | 27 +++++ cli/commandadd.cpp | 196 +++++++++++++++++++++++++++++++++++ cli/commandadd.h | 24 +++++ cli/defaulttextprocessor.cpp | 30 ++++++ cli/defaulttextprocessor.h | 17 +++ cli/encodingdetector.cpp | 45 ++++++++ cli/encodingdetector.h | 14 +++ cli/main.cpp | 73 +++++++++++++ cli/nothingprocessor.cpp | 5 + cli/nothingprocessor.h | 19 ++++ cli/odsprocessor.cpp | 5 + cli/odsprocessor.h | 10 ++ cli/odtprocessor.cpp | 26 +++++ cli/odtprocessor.h | 16 +++ cli/pagedata.h | 19 ++++ cli/pdfprocessor.cpp | 32 ++++++ cli/pdfprocessor.h | 13 +++ cli/processor.cpp | 5 + cli/processor.h | 32 ++++++ cli/qssgeneralexception.cpp | 1 + cli/qssgeneralexception.h | 24 +++++ cli/tagstripperprocessor.cpp | 14 +++ cli/tagstripperprocessor.h | 14 +++ cli/utils.cpp | 20 ++++ cli/utils.h | 15 +++ config.py | 5 - delindex | 21 ---- searchindex | 21 ---- 32 files changed, 788 insertions(+), 214 deletions(-) delete mode 100755 addindex create mode 100644 cli/addfileexception.h create mode 100644 cli/cli.pro create mode 100644 cli/command.cpp create mode 100644 cli/command.h create mode 100644 cli/commandadd.cpp create mode 100644 cli/commandadd.h create mode 100644 cli/defaulttextprocessor.cpp create mode 100644 cli/defaulttextprocessor.h create mode 100644 cli/encodingdetector.cpp create mode 100644 cli/encodingdetector.h create mode 100644 cli/main.cpp create mode 100644 cli/nothingprocessor.cpp create mode 100644 cli/nothingprocessor.h create mode 100644 cli/odsprocessor.cpp create mode 100644 cli/odsprocessor.h create mode 100644 cli/odtprocessor.cpp create mode 100644 cli/odtprocessor.h create mode 100644 cli/pagedata.h create mode 100644 cli/pdfprocessor.cpp create mode 100644 cli/pdfprocessor.h create mode 100644 cli/processor.cpp create mode 100644 cli/processor.h create mode 100644 cli/qssgeneralexception.cpp create mode 100644 cli/qssgeneralexception.h create mode 100644 cli/tagstripperprocessor.cpp create mode 100644 cli/tagstripperprocessor.h create mode 100644 cli/utils.cpp create mode 100644 cli/utils.h delete mode 100644 config.py delete mode 100755 delindex delete mode 100755 searchindex diff --git a/addindex b/addindex deleted file mode 100755 index c1b8e60..0000000 --- a/addindex +++ /dev/null @@ -1,167 +0,0 @@ -#!/usr/bin/python3 -import sqlite3 -import os.path -import sys -import subprocess -import zipfile -import xml.etree.ElementTree -import re -import chardet -import config -from multiprocessing import Pool - - - -class pagedata: - page = 0 - content = "" - - -def singlepagelist(content): - result = pagedata() - result.content = content - result.page = 0 - l = list(); - l.append(result) - return l - -def striptags(content): - result = "" - try: - result = ''.join(xml.etree.ElementTree.fromstring(content).itertext()) - except: - #TODO: test
test2 will make it testtest2 not test test2 - result = re.sub('<[^>]*>', '', content) - - return result - - -def strip_irrelevant(content): - result = content.replace("\n", " ").replace("\t", " ") - result = re.sub(' +', ' ', result) - return result; - - - -def process_pdf(path): - result = list() - args=["pdftotext", path, "-"] - stdout,stderr = subprocess.Popen(args,stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() - content = strip_irrelevant(stdout.decode('utf-8')) - #it is faster to do it like this than to call pdfottext for each page - splitted = content.split("\f") - count=1 - for page in splitted: - data = pagedata() - data.page = count - data.content = page - result.append(data) - count += 1 - - #TODO: current hack, so we can fts search several words over the whole document - #this of course uses more space, but in the end that's not a big problem - #Nevertheless, this remains a hack - everything = pagedata() - everything.page = 0 - everything.content = content.replace("\f", "") - result.append(everything) - return result - -def process_odt(path): - fd = zipfile.ZipFile(path) - content = fd.read("content.xml").decode("utf-8") - fd.close() - return singlepagelist(striptags(content)) - -def process_ods(path): - return process_odt(path) - -def readtext(path): - fd = open(path, "rb") - content = fd.read() - fd.close() - - result="" - try: - result = str(content.decode("utf-8")) - except: - try: - encoding = chardet.detect(content)["encoding"]; - if encoding == None: - result = "" - else: - result = str(content.decode(encoding)) - except: - print("FAILED DECODING: " + path) - result = "" - return result - -def process_striptags(path): - content = readtext(path) - return singlepagelist(striptags(content)) - -def process_text(path): - return singlepagelist(readtext(path)) - -def process_nothing(path): - return list() - -def exists(abspath, mtime): - cursor = dbcon.cursor() - cursor.execute("SELECT 1 FROM file WHERE path = ? AND mtime = ?" , (abspath, mtime)) - result = cursor.fetchone() - if result != None and result[0] == 1: - return True - return False - -def insert(path): - print("processing", path) - abspath=os.path.abspath(path) - mtime = int(os.stat(abspath).st_mtime) - - if exists(abspath, mtime): - print("Leaving alone " + abspath + " because it wasn't changed") - return - basename=os.path.basename(abspath) - ext = os.path.splitext(abspath)[1] - - content="" - - processor=process_nothing - if ext in preprocess: - processor=preprocess[ext] - pagedatalist = processor(abspath) - - #TODO: assumes sqlitehas been built with thread safety (and it is the default) - cursor = dbcon.cursor() - cursor.execute("BEGIN TRANSACTION") - cursor.execute("DELETE FROM file WHERE path = ?", (abspath,)) - cursor.execute("INSERT INTO file(path, mtime) VALUES(?, ?) ", (abspath, mtime)) - fileid=cursor.lastrowid - for pagedata in pagedatalist: - cursor.execute("INSERT INTO content(fileid, page, content) VALUES(?, ?, ?)", (fileid, pagedata.page, pagedata.content)) - cursor.execute("COMMIT TRANSACTION") - -preprocess={".pdf":process_pdf, ".odt":process_odt, ".ods":process_ods, ".html":process_striptags, ".xml":process_nothing, ".txt":process_text, - ".sql":process_text, ".c":process_text, ".cpp":process_text, ".js":process_text, ".java":process_text, - ".py":process_text, '.md':process_text} - -def init(): - global dbcon - dbcon = sqlite3.connect(config.DBPATH, isolation_level=None) - - -dbcon = None -if __name__ == '__main__': - with Pool(processes=4,initializer=init) as pool: - - if len(sys.argv) < 2: - pool.map(insert, (l.replace("\n", "") for l in sys.stdin)) - else: - pool.map(insert, sys.argv[1:]) - - - - - - diff --git a/cli/addfileexception.h b/cli/addfileexception.h new file mode 100644 index 0000000..6e2ccfc --- /dev/null +++ b/cli/addfileexception.h @@ -0,0 +1,8 @@ +#ifndef ADDFILEEXCEPTION_H +#define ADDFILEEXCEPTION_H +#include +#include +class AddFileException : public QException +{ +}; +#endif // ADDFILEEXCEPTION_H diff --git a/cli/cli.pro b/cli/cli.pro new file mode 100644 index 0000000..04732bd --- /dev/null +++ b/cli/cli.pro @@ -0,0 +1,46 @@ +QT -= gui +QT += sql concurrent +CONFIG += c++11 console +CONFIG -= app_bundle + +# The following define makes your compiler emit warnings if you use +# any feature of Qt which as been marked deprecated (the exact warnings +# depend on your compiler). Please consult the documentation of the +# deprecated API in order to know how to port your code away from it. +DEFINES += QT_DEPRECATED_WARNINGS + +# You can also make your code fail to compile if you use deprecated APIs. +# In order to do so, uncomment the following line. +# You can also select to disable deprecated APIs only up to a certain version of Qt. +#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000 # disables all the APIs deprecated before Qt 6.0.0 +LIBS += -luchardet -lpoppler-qt5 -lquazip +SOURCES += \ + main.cpp \ + encodingdetector.cpp \ + processor.cpp \ + pdfprocessor.cpp \ + defaulttextprocessor.cpp \ + command.cpp \ + commandadd.cpp \ + tagstripperprocessor.cpp \ + nothingprocessor.cpp \ + odtprocessor.cpp \ + utils.cpp \ + odsprocessor.cpp \ + qssgeneralexception.cpp + +HEADERS += \ + encodingdetector.h \ + processor.h \ + pagedata.h \ + pdfprocessor.h \ + defaulttextprocessor.h \ + command.h \ + commandadd.h \ + tagstripperprocessor.h \ + nothingprocessor.h \ + odtprocessor.h \ + utils.h \ + odsprocessor.h \ + qssgeneralexception.h +INCLUDEPATH += /usr/include/poppler/qt5/ /usr/include/quazip5 diff --git a/cli/command.cpp b/cli/command.cpp new file mode 100644 index 0000000..9992995 --- /dev/null +++ b/cli/command.cpp @@ -0,0 +1,38 @@ +#include +#include +#include +#include "command.h" +#include "qssgeneralexception.h" + +bool Command::fileExistsInDatabase(QSqlDatabase &db, QString path, qint64 mtime) +{ + auto query = QSqlQuery("SELECT 1 FROM file WHERE path = ? and mtime = ?", db); + query.addBindValue(path); + query.addBindValue(mtime); + if(!query.exec()) + { + throw QSSGeneralException("Error while trying to query for file existance"); + } + if(!query.next()) + { + return false; + } + return query.value(0).toBool(); +} + +QSqlDatabase Command::dbConnection() +{ + if(dbStore.hasLocalData()) + { + return dbStore.localData(); + } + QSqlDatabase db = + QSqlDatabase::addDatabase("QSQLITE", "QSS" + QString::number((quint64)QThread::currentThread(), 16)); + db.setDatabaseName(this->dbConnectionString); + if(!db.open()) + { + qDebug() << "Failed to open the database: " << this->dbConnectionString; + } + dbStore.setLocalData(db); + return db; +} diff --git a/cli/command.h b/cli/command.h new file mode 100644 index 0000000..f3ae949 --- /dev/null +++ b/cli/command.h @@ -0,0 +1,27 @@ +#ifndef COMMAND_H +#define COMMAND_H +#include +#include +#include +#include +#include +class Command +{ + protected: + bool fileExistsInDatabase(QSqlDatabase &db, QString path, qint64 mtime); + QByteArray readFile(QString path) const; + QString dbConnectionString; + QThreadStorage dbStore; + + public: + Command(QString dbConnectionString) + { + this->dbConnectionString = dbConnectionString; + } + + QSqlDatabase dbConnection(); + virtual int handle(QStringList arguments) = 0; + virtual ~Command(){}; +}; + +#endif // COMMAND_H diff --git a/cli/commandadd.cpp b/cli/commandadd.cpp new file mode 100644 index 0000000..a000701 --- /dev/null +++ b/cli/commandadd.cpp @@ -0,0 +1,196 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "processor.h" +#include "pdfprocessor.h" +#include "commandadd.h" +#include "defaulttextprocessor.h" +#include "tagstripperprocessor.h" +#include "nothingprocessor.h" +#include "odtprocessor.h" +#include "odsprocessor.h" +#include "utils.h" +static DefaultTextProcessor *defaultTextProcessor = new DefaultTextProcessor(); +static TagStripperProcessor *tagStripperProcessor = new TagStripperProcessor(); +static NothingProcessor *nothingProcessor = new NothingProcessor(); +static OdtProcessor *odtProcessor = new OdtProcessor(); +static OdsProcessor *odsProcessor = new OdsProcessor(); + +static QMap processors{ + {"pdf", new PdfProcessor()}, {"txt", defaultTextProcessor}, {"md", defaultTextProcessor}, + {"py", defaultTextProcessor}, {"xml", nothingProcessor}, {"html", tagStripperProcessor}, + {"java", defaultTextProcessor}, {"js", defaultTextProcessor}, {"cpp", defaultTextProcessor}, + {"c", defaultTextProcessor}, {"sql", defaultTextProcessor}, {"odt", odtProcessor}, + {"ods", odsProcessor}}; + +AddFileResult CommandAdd::addFile(QString path) +{ + QFileInfo info(path); + QString absPath = info.absoluteFilePath(); + auto mtime = info.lastModified().toSecsSinceEpoch(); + QChar fileType = info.isDir() ? 'd' : 'f'; + + QSqlDatabase db = dbConnection(); + if(fileExistsInDatabase(db, absPath, mtime)) + { + return SKIPPED; + } + + Processor *processor = processors.value(info.suffix(), nothingProcessor); + QVector pageData; + if(processor->PREFERED_DATA_SOURCE == FILEPATH) + { + pageData = processor->process(absPath); + } + else + { + pageData = processor->process(Utils::readFile(absPath)); + } + + if(pageData.isEmpty()) + { + qDebug() << "Could not get any content for " << absPath; + } + + // Workaround to "database is locked" error. Perhaps try WAL mode etc. + // QMutexLocker locker(&writeMutex); + if(!db.transaction()) + { + qDebug() << "Failed to open transaction for " << absPath << " : " << db.lastError(); + return DBFAIL; + } + + QSqlQuery delQuery("DELETE FROM file WHERE path = ?", db); + delQuery.addBindValue(absPath); + if(!delQuery.exec()) + { + qDebug() << "Failed DELETE query" << delQuery.lastError(); + db.rollback(); + return DBFAIL; + } + + QSqlQuery inserterQuery("INSERT INTO file(path, mtime, size, filetype) VALUES(?, ?, ?, ?)", db); + inserterQuery.addBindValue(absPath); + inserterQuery.addBindValue(mtime); + inserterQuery.addBindValue(info.size()); + inserterQuery.addBindValue(fileType); + if(!inserterQuery.exec()) + { + qDebug() << "Failed INSERT query" << inserterQuery.lastError(); + db.rollback(); + return DBFAIL; + } + int lastid = inserterQuery.lastInsertId().toInt(); + for(PageData &data : pageData) + { + QSqlQuery contentQuery("INSERT INTO content(fileid, page, content) VALUES(?, ?, ?)", db); + contentQuery.addBindValue(lastid); + contentQuery.addBindValue(data.pagenumber); + contentQuery.addBindValue(data.content); + if(!contentQuery.exec()) + { + db.rollback(); + qDebug() << "Failed content insertion " << contentQuery.lastError(); + return DBFAIL; + } + } + + if(!db.commit()) + { + db.rollback(); + qDebug() << "Failed to commit transaction for " << absPath << " : " << db.lastError(); + return DBFAIL; + } + return OK; +} + +int CommandAdd::handle(QStringList arguments) +{ + QCommandLineParser parser; + parser.addOptions({{{"c", "continue"}, "Continue adding files, don't exit on first error"}, + {{"a", "all"}, "On error, no files should be added, even already processed ones"}, + {{"v", "verbose"}, "Print skipped and added files"}, + {{"n", "threads"}, "Number of threads to use.", "threads"}}); + + parser.addHelpOption(); + parser.addPositionalArgument("paths", "List of paths to process/add to the index", "[paths...]"); + + parser.process(arguments); + bool keepGoing = false; + bool verbose = false; + if(parser.isSet("continue")) + { + keepGoing = true; + } + if(parser.isSet("verbose")) + { + verbose = true; + } + if(parser.isSet("all")) + { + throw QSSGeneralException("To be implemented"); + } + if(parser.isSet("threads")) + { + QString threadsCount = parser.value("threads"); + QThreadPool::globalInstance()->setMaxThreadCount(threadsCount.toInt()); + } + + QStringList files = parser.positionalArguments(); + + if(files.length() == 0) + { + QTextStream stream(stdin); + + while(!stream.atEnd()) + { + QString path = stream.readLine(); + files.append(path); + } + } + + bool terminate = false; + QtConcurrent::blockingMap(files, + [&](QString &path) + { + if(terminate) + { + return; + } + if(verbose) + { + qDebug() << "Processing " << path; + } + auto result = addFile(path); + if(result == DBFAIL) + { + qDebug() << "Failed to add " << path; + if(!keepGoing) + { + terminate = true; + } + } + if(verbose) + { + if(result == SKIPPED) + { + qDebug() << "SKIPPED" << path << "as it already exists in the database"; + } + else + { + qDebug() << "Added" << path; + } + } + }); + + return 0; +} diff --git a/cli/commandadd.h b/cli/commandadd.h new file mode 100644 index 0000000..f62bddf --- /dev/null +++ b/cli/commandadd.h @@ -0,0 +1,24 @@ +#ifndef COMMANDADD_H +#define COMMANDADD_H +#include +#include "command.h" +enum AddFileResult +{ + OK, + SKIPPED, + DBFAIL +}; + +class CommandAdd : public Command +{ + private: + AddFileResult addFile(QString path); + QMutex writeMutex; + + public: + using Command::Command; + + int handle(QStringList arguments) override; +}; + +#endif // COMMANDADD_H diff --git a/cli/defaulttextprocessor.cpp b/cli/defaulttextprocessor.cpp new file mode 100644 index 0000000..c393486 --- /dev/null +++ b/cli/defaulttextprocessor.cpp @@ -0,0 +1,30 @@ +#include +#include +#include +#include +#include "defaulttextprocessor.h" + +DefaultTextProcessor::DefaultTextProcessor() +{ +} + +QString DefaultTextProcessor::processText(const QByteArray &data) const +{ + QString encoding = encodingDetector.detectEncoding(data); + if(!encoding.isEmpty()) + { + QTextCodec *codec = QTextCodec::codecForName(encoding.toUtf8()); + if(codec != nullptr) + { + return codec->toUnicode(data); + } + qWarning() << "No codec found for " << encoding; + return QString(data); + } + return {}; +} + +QVector DefaultTextProcessor::process(const QByteArray &data) const +{ + return {{0, processText(data)}}; +} diff --git a/cli/defaulttextprocessor.h b/cli/defaulttextprocessor.h new file mode 100644 index 0000000..13d1061 --- /dev/null +++ b/cli/defaulttextprocessor.h @@ -0,0 +1,17 @@ +#ifndef DEFAULTTEXTPROCESSOR_H +#define DEFAULTTEXTPROCESSOR_H + +#include "processor.h" +#include "encodingdetector.h" +class DefaultTextProcessor : public Processor +{ + protected: + EncodingDetector encodingDetector; + + public: + DefaultTextProcessor(); + QString processText(const QByteArray &data) const; + QVector process(const QByteArray &data) const override; +}; + +#endif // DEFAULTTEXTPROCESSOR_H diff --git a/cli/encodingdetector.cpp b/cli/encodingdetector.cpp new file mode 100644 index 0000000..5c4af20 --- /dev/null +++ b/cli/encodingdetector.cpp @@ -0,0 +1,45 @@ +#include +#include "encodingdetector.h" +#include +EncodingDetector::EncodingDetector() +{ +} + +QString EncodingDetector::detectEncoding(const QByteArray &data) const +{ + uchardet_t detector = uchardet_new(); + if(uchardet_handle_data(detector, data.data(), data.size()) != 0) + { + uchardet_delete(detector); + throw QSSGeneralException("Decoder failed"); + } + uchardet_data_end(detector); + QString encoding = uchardet_get_charset(detector); + uchardet_delete(detector); + return encoding; +} +QString EncodingDetector::detectEncoding(QDataStream &s) const +{ + uchardet_t detector = uchardet_new(); + + char buffer[4096]; + int n; + while((n = s.readRawData(buffer, sizeof(buffer))) > 0) + { + if(uchardet_handle_data(detector, buffer, n) != 0) + { + uchardet_delete(detector); + + throw QSSGeneralException("Decoder failed"); + } + } + if(n == -1) + { + uchardet_delete(detector); + throw QSSGeneralException("Read failed"); + } + uchardet_data_end(detector); + QString encoding = uchardet_get_charset(detector); + uchardet_delete(detector); + return encoding; +} diff --git a/cli/encodingdetector.h b/cli/encodingdetector.h new file mode 100644 index 0000000..ebc5ab3 --- /dev/null +++ b/cli/encodingdetector.h @@ -0,0 +1,14 @@ +#ifndef ENCODINGDETECTOR_H +#define ENCODINGDETECTOR_H +#include +#include +class EncodingDetector +{ + + public: + EncodingDetector(); + QString detectEncoding(const QByteArray &data) const; + QString detectEncoding(QDataStream &s) const; +}; + +#endif // ENCODINGDETECTOR_H diff --git a/cli/main.cpp b/cli/main.cpp new file mode 100644 index 0000000..5a9138d --- /dev/null +++ b/cli/main.cpp @@ -0,0 +1,73 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "encodingdetector.h" +#include "pdfprocessor.h" +#include "defaulttextprocessor.h" +#include "command.h" +#include "commandadd.h" +void printUsage(QString argv0) +{ + qInfo() << "Usage: " << argv0 << "command"; +} + +Command *commandFromName(QString name, QString connectionstring) +{ + if(name == "add") + { + return new CommandAdd(connectionstring); + } + if(name == "delete") + { + } + if(name == "update") + { + } + if(name == "search") + { + } + return nullptr; +} + +int main(int argc, char *argv[]) +{ + QCoreApplication app(argc, argv); + + QStringList args = app.arguments(); + QString argv0 = args.takeFirst(); + if(args.length() < 1) + { + printUsage(argv0); + exit(1); + } + + QString commandName = args.first(); + Command *cmd = commandFromName(commandName, QProcessEnvironment::systemEnvironment().value("QSS_PATH")); + if(cmd != nullptr) + { + try + { + return cmd->handle(args); + } + catch(const QSSGeneralException &e) + { + qDebug() << "Exception caught, message: " << e.message; + } + } + else + { + qDebug() << "Unknown command " << commandName; + } + return 1; +} diff --git a/cli/nothingprocessor.cpp b/cli/nothingprocessor.cpp new file mode 100644 index 0000000..d76c924 --- /dev/null +++ b/cli/nothingprocessor.cpp @@ -0,0 +1,5 @@ +#include "nothingprocessor.h" + +NothingProcessor::NothingProcessor() +{ +} diff --git a/cli/nothingprocessor.h b/cli/nothingprocessor.h new file mode 100644 index 0000000..0a1d4c0 --- /dev/null +++ b/cli/nothingprocessor.h @@ -0,0 +1,19 @@ +#ifndef NOTHINGPROCESSOR_H +#define NOTHINGPROCESSOR_H +#include +#include "processor.h" +#include "pagedata.h" + +class NothingProcessor : public Processor +{ + public: + NothingProcessor(); + + public: + QVector process(const QByteArray &data) const override + { + return {}; + } +}; + +#endif // NOTHINGPROCESSOR_H diff --git a/cli/odsprocessor.cpp b/cli/odsprocessor.cpp new file mode 100644 index 0000000..d7e5d29 --- /dev/null +++ b/cli/odsprocessor.cpp @@ -0,0 +1,5 @@ +#include "odsprocessor.h" + +OdsProcessor::OdsProcessor() +{ +} diff --git a/cli/odsprocessor.h b/cli/odsprocessor.h new file mode 100644 index 0000000..3bbc29c --- /dev/null +++ b/cli/odsprocessor.h @@ -0,0 +1,10 @@ +#ifndef ODSPROCESSOR_H +#define ODSPROCESSOR_H +#include "odtprocessor.h" +class OdsProcessor : public OdtProcessor +{ + public: + OdsProcessor(); +}; + +#endif // ODSPROCESSOR_H diff --git a/cli/odtprocessor.cpp b/cli/odtprocessor.cpp new file mode 100644 index 0000000..884da70 --- /dev/null +++ b/cli/odtprocessor.cpp @@ -0,0 +1,26 @@ +#include +#include +#include "odtprocessor.h" +#include "tagstripperprocessor.h" + +QVector OdtProcessor::process(const QByteArray &data) const +{ + throw QSSGeneralException("Not implemented yet"); +} + +QVector OdtProcessor::process(QString path) const +{ + QuaZipFile zipFile(path); + zipFile.setFileName("content.xml"); + if(!zipFile.open(QIODevice::ReadOnly)) + { + throw QSSGeneralException("Error while opening file " + path); + } + QByteArray entireContent = zipFile.readAll(); + if(entireContent.isEmpty()) + { + throw QSSGeneralException("Error while reading content.xml of " + path); + } + TagStripperProcessor tsp; + return tsp.process(entireContent); +} diff --git a/cli/odtprocessor.h b/cli/odtprocessor.h new file mode 100644 index 0000000..cef4083 --- /dev/null +++ b/cli/odtprocessor.h @@ -0,0 +1,16 @@ +#ifndef ODTPROCESSOR_H +#define ODTPROCESSOR_H +#include "processor.h" +class OdtProcessor : public Processor +{ + public: + OdtProcessor() + { + this->PREFERED_DATA_SOURCE = FILEPATH; + } + QVector process(const QByteArray &data) const override; + + QVector process(QString path) const override; +}; + +#endif // ODTPROCESSOR_H diff --git a/cli/pagedata.h b/cli/pagedata.h new file mode 100644 index 0000000..d550116 --- /dev/null +++ b/cli/pagedata.h @@ -0,0 +1,19 @@ +#ifndef PAGEDATA_H +#define PAGEDATA_H +#include +class PageData +{ + public: + unsigned int pagenumber = 0; + QString content; + + PageData() + { + } + PageData(unsigned int pagenumber, QString content) + { + this->pagenumber = pagenumber; + this->content = content; + } +}; +#endif // PAGEDATA_H diff --git a/cli/pdfprocessor.cpp b/cli/pdfprocessor.cpp new file mode 100644 index 0000000..ec20c9e --- /dev/null +++ b/cli/pdfprocessor.cpp @@ -0,0 +1,32 @@ +#include +#include +#include "pdfprocessor.h" +PdfProcessor::PdfProcessor() +{ +} + +QVector PdfProcessor::process(const QByteArray &data) const +{ + QVector result; + QScopedPointer doc(Poppler::Document::loadFromData(data)); + if(doc.isNull()) + { + return {}; + } + QRectF entirePage; + + auto pagecount = doc->numPages(); + QString entire; + entire.reserve(data.size()); // TODO too much + for(auto i = 0; i < pagecount; i++) + { + QString text = doc->page(i)->text(entirePage); + result.append({static_cast(i + 1), text}); + /*TODO: hack, so we can fts search several words over the whole document, not just pages. + * this of course uses more space and should be solved differently. + */ + entire += text; + } + result.append({0, entire}); + return result; +} diff --git a/cli/pdfprocessor.h b/cli/pdfprocessor.h new file mode 100644 index 0000000..4802879 --- /dev/null +++ b/cli/pdfprocessor.h @@ -0,0 +1,13 @@ +#ifndef PDFPROCESSOR_H +#define PDFPROCESSOR_H +#include "processor.h" +class PdfProcessor : public Processor +{ + public: + PdfProcessor(); + + public: + QVector process(const QByteArray &data) const override; +}; + +#endif // PDFPROCESSOR_H diff --git a/cli/processor.cpp b/cli/processor.cpp new file mode 100644 index 0000000..f16cb36 --- /dev/null +++ b/cli/processor.cpp @@ -0,0 +1,5 @@ +#include "processor.h" + +Processor::Processor() +{ +} diff --git a/cli/processor.h b/cli/processor.h new file mode 100644 index 0000000..04bd04a --- /dev/null +++ b/cli/processor.h @@ -0,0 +1,32 @@ +#ifndef PROCESSOR_H +#define PROCESSOR_H +#include +#include +#include "pagedata.h" +#include "utils.h" +enum DataSource +{ + FILEPATH, + ARRAY +}; + +class Processor +{ + public: + /* Indicates the data source the processor performs best with. For example, + * you do not want to read the entire of a compressed archive just to get the content of + * a single file */ + DataSource PREFERED_DATA_SOURCE = ARRAY; + Processor(); + virtual QVector process(const QByteArray &data) const = 0; + virtual QVector process(QString path) const + { + return process(Utils::readFile(path)); + } + + virtual ~Processor() + { + } +}; + +#endif // PROCESSOR_H diff --git a/cli/qssgeneralexception.cpp b/cli/qssgeneralexception.cpp new file mode 100644 index 0000000..0901f06 --- /dev/null +++ b/cli/qssgeneralexception.cpp @@ -0,0 +1 @@ +#include "qssgeneralexception.h" diff --git a/cli/qssgeneralexception.h b/cli/qssgeneralexception.h new file mode 100644 index 0000000..bef5a2a --- /dev/null +++ b/cli/qssgeneralexception.h @@ -0,0 +1,24 @@ +#ifndef QSSGENERALEXCEPTION_H +#define QSSGENERALEXCEPTION_H + +#include + +class QSSGeneralException : public QException +{ + public: + QString message; + QSSGeneralException(QString message) + { + this->message = message; + }; + void raise() const override + { + throw *this; + } + QSSGeneralException *clone() const override + { + return new QSSGeneralException(*this); + } +}; + +#endif // QSSGENERALEXCEPTION_H diff --git a/cli/tagstripperprocessor.cpp b/cli/tagstripperprocessor.cpp new file mode 100644 index 0000000..1cef41d --- /dev/null +++ b/cli/tagstripperprocessor.cpp @@ -0,0 +1,14 @@ +#include "tagstripperprocessor.h" + +TagStripperProcessor::TagStripperProcessor() +{ +} + +QVector TagStripperProcessor::process(const QByteArray &data) const +{ + auto result = DefaultTextProcessor::process(data); + // TODO: does not work properly with
and does not deal with entities... + + result[0].content.remove(QRegExp("<[^>]*>")); + return result; +} diff --git a/cli/tagstripperprocessor.h b/cli/tagstripperprocessor.h new file mode 100644 index 0000000..4e485d1 --- /dev/null +++ b/cli/tagstripperprocessor.h @@ -0,0 +1,14 @@ +#ifndef XMLSTRIPPERPROCESSOR_H +#define XMLSTRIPPERPROCESSOR_H +#include "defaulttextprocessor.h" + +class TagStripperProcessor : public DefaultTextProcessor +{ + public: + TagStripperProcessor(); + + public: + QVector process(const QByteArray &data) const override; +}; + +#endif // XMLSTRIPPERPROCESSOR_H diff --git a/cli/utils.cpp b/cli/utils.cpp new file mode 100644 index 0000000..802af30 --- /dev/null +++ b/cli/utils.cpp @@ -0,0 +1,20 @@ +#include +#include "utils.h" +Utils::Utils() +{ +} + +QByteArray Utils::readFile(QString path) +{ + QFile file(path); + if(!file.open(QIODevice::ReadOnly)) + { + throw QSSGeneralException("Failed to open file: " + path); + } + QByteArray data = file.readAll(); + if(data.isEmpty() && file.error() != QFileDevice::FileError::NoError) + { + throw QSSGeneralException("Error reading file: " + path + ", Error: " + file.error()); + } + return data; +} diff --git a/cli/utils.h b/cli/utils.h new file mode 100644 index 0000000..ee5143a --- /dev/null +++ b/cli/utils.h @@ -0,0 +1,15 @@ +#ifndef UTILS_H +#define UTILS_H +#include +#include +#include +#include "qssgeneralexception.h" + +class Utils +{ + public: + Utils(); + static QByteArray readFile(QString path); +}; + +#endif // UTILS_H diff --git a/config.py b/config.py deleted file mode 100644 index e8408e8..0000000 --- a/config.py +++ /dev/null @@ -1,5 +0,0 @@ -import os -DBPATH=os.getenv("QSS_PATH") -if DBPATH == None or DBPATH == "": - print("MIssing env var") - exit(1) diff --git a/delindex b/delindex deleted file mode 100755 index 73679c4..0000000 --- a/delindex +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/sh -TEMPFILE=$(mktemp) -DBFILE="$QSS_PATH" -function todelete() -{ - echo "DELETE FROM file WHERE path = '$1';" >> /"$TEMPFILE" -} - -echo "BEGIN TRANSACTION;" >> /"$TEMPFILE" - -sqlite3 "$DBFILE" "SELECT path FROM file;"| while read line ; do -[ -e "$line" ] || todelete "$line" -done - -echo "COMMIT TRANSACTION;" >> /"$TEMPFILE" - -sqlite3 "$DBFILE" < /"$TEMPFILE" - - - - diff --git a/searchindex b/searchindex deleted file mode 100755 index 35e4da8..0000000 --- a/searchindex +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/python3 -import sqlite3 -import sys -import config - -dbcon = sqlite3.connect(config.DBPATH, isolation_level=None) -cursor = dbcon.cursor() - -if len(sys.argv) < 2: - print("Error: Missing search") - -search=sys.argv[1:] -#TODO: machien parseable -for row in cursor.execute("SELECT file.path, content.page FROM file INNER JOIN content ON file.id = content.fileid INNER JOIN content_fts ON content.id = content_fts.ROWID WHERE content_fts.content MATCH ? ORDER By file.mtime ASC", (search)): - print("File:", row[0], "Page: ", row[1]) -dbcon.close() - - - - -