From 364d56f5d478086bfc7efc2a1cc50b3447d4cab9 Mon Sep 17 00:00:00 2001
From: Albert S
Date: Wed, 3 Jan 2018 09:40:13 +0100
Subject: [PATCH] first commit

---
 README.md   |  19 +++++++
 addindex    | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 config.py   |   1 +
 create.sql  |  15 ++++++
 delindex    |  23 ++++++++
 searchindex |  26 +++++++++
 6 files changed, 256 insertions(+)
 create mode 100644 README.md
 create mode 100755 addindex
 create mode 100644 config.py
 create mode 100644 create.sql
 create mode 100755 delindex
 create mode 100755 searchindex

diff --git a/README.md b/README.md
new file mode 100644
--- /dev/null
+++ b/README.md
@@ -0,0 +1,19 @@
+easyindex
+=========
+easyindex creates a poor man's full-text search for your files using a
+sqlite database.
+
+You need the python "chardet" package, since it will try to convert the
+encoding of the files in case initial utf-8 decoding fails.
+
+pdftotext is needed to search in .pdf files.
+
+No GUI is provided at this time, nor does it concern itself with search
+too much.
+
+Setup
+-----
+Create the database at the path configured as DBPATH in config.py
+(and as DBFILE in delindex):
+
+sqlite3 /home/db/easyindex.sqlite < create.sql
diff --git a/addindex b/addindex
new file mode 100755
--- /dev/null
+++ b/addindex
@@ -0,0 +1,172 @@
+#!/usr/bin/python3
+"""Add files to the easyindex full-text index.
+
+Paths are taken from the command line or, when none are given, read
+one per line from standard input.
+"""
+import sqlite3
+import os.path
+import sys
+import subprocess
+import zipfile
+import xml.etree.ElementTree
+import re
+import chardet
+import config
+
+dbcon = sqlite3.connect(config.DBPATH, isolation_level=None)
+
+
+def striptags(content):
+    """Return the text of *content* with markup removed.
+
+    Well-formed XML is parsed properly; anything else falls back to a
+    regex that swaps each tag for a space, so "test<br>test2" becomes
+    "test test2" rather than "testtest2".
+    """
+    try:
+        root = xml.etree.ElementTree.fromstring(content)
+    except (xml.etree.ElementTree.ParseError, ValueError):
+        return re.sub(r'<[^>]*>', ' ', content)
+    return ''.join(root.itertext())
+
+
+def strip_irrelevant(content):
+    """Collapse newlines, tabs and runs of spaces; drop form feeds."""
+    result = content.replace("\n", " ").replace("\t", " ").replace("\f", "")
+    return re.sub(' +', ' ', result)
+
+
+def process_pdf(path):
+    """Return the text of the PDF at *path*, extracted with pdftotext.
+
+    Returns "" (after printing a message) when pdftotext cannot be run
+    or exits with an error, so one bad file does not abort the run.
+    """
+    args = ["pdftotext", path, "-"]
+    try:
+        proc = subprocess.run(args, stdout=subprocess.PIPE,
+                              stderr=subprocess.PIPE)
+    except OSError as err:
+        print("FAILED RUNNING pdftotext: " + str(err))
+        return ""
+    if proc.returncode != 0:
+        print("FAILED CONVERTING: " + path)
+        return ""
+    return strip_irrelevant(proc.stdout.decode('utf-8', errors='replace'))
+
+
+def process_odt(path):
+    """Return the text of content.xml in the .odt archive at *path*."""
+    try:
+        with zipfile.ZipFile(path) as archive:
+            content = archive.read("content.xml").decode("utf-8")
+    except (zipfile.BadZipFile, KeyError, UnicodeDecodeError):
+        print("FAILED READING: " + path)
+        return ""
+    return striptags(content)
+
+
+def process_striptags(path):
+    """Return the decoded text of the file at *path* with tags removed."""
+    return striptags(process_text(path))
+
+
+def process_text(path):
+    """Return the file at *path* decoded to str, or "" if that fails.
+
+    UTF-8 is tried first, then the encoding guessed by chardet.
+    """
+    with open(path, "rb") as fd:
+        content = fd.read()
+
+    try:
+        return content.decode("utf-8")
+    except UnicodeDecodeError:
+        pass
+
+    encoding = chardet.detect(content)["encoding"]
+    if encoding is None:
+        return ""
+    try:
+        return content.decode(encoding)
+    except (UnicodeDecodeError, LookupError):
+        # LookupError: chardet named a codec Python does not provide.
+        print("FAILED DECODING: " + path)
+        return ""
+
+
+def process_nothing(path):
+    """Return "" for files whose content is not indexed."""
+    return ""
+
+
+def exists(abspath, mtime):
+    """Return True if *abspath* is already indexed with this *mtime*."""
+    cursor = dbcon.cursor()
+    cursor.execute("SELECT 1 FROM file WHERE path = ? AND mtime = ?",
+                   (abspath, mtime))
+    return cursor.fetchone() is not None
+
+
+def insert(path, cursor):
+    """Index the file at *path* through *cursor* unless it is unchanged."""
+    print("processing", path)
+    abspath = os.path.abspath(path)
+    try:
+        mtime = int(os.stat(abspath).st_mtime)
+    except OSError as err:
+        print("Skipping " + abspath + ": " + str(err))
+        return
+
+    if exists(abspath, mtime):
+        print("Leaving alone " + abspath + " because it wasn't changed")
+        return
+
+    ext = os.path.splitext(abspath)[1]
+    try:
+        content = preprocess.get(ext, process_nothing)(abspath)
+    except OSError as err:
+        print("Skipping " + abspath + ": " + str(err))
+        return
+
+    # UPDATE first and INSERT only when no row exists. INSERT OR REPLACE
+    # would delete the old row without firing the file_ad trigger
+    # (recursive triggers are off by default), leaving its text in the
+    # FTS index.
+    cursor.execute("UPDATE file SET mtime = ?, content = ? WHERE path = ?",
+                   (mtime, content, abspath))
+    if cursor.rowcount == 0:
+        cursor.execute("INSERT INTO file(path, mtime, content) "
+                       "VALUES(?, ?, ?)", (abspath, mtime, content))
+
+
+preprocess = {
+    ".pdf": process_pdf, ".odt": process_odt, ".html": process_striptags,
+    ".xml": process_nothing, ".txt": process_text, ".sql": process_text,
+    ".c": process_text, ".cpp": process_text, ".js": process_text,
+    ".java": process_text, ".py": process_text, ".md": process_text,
+}
+
+
+def main():
+    """Index every requested path inside a single transaction."""
+    if len(sys.argv) < 2:
+        paths = (line.rstrip("\n") for line in sys.stdin)
+    else:
+        paths = sys.argv[1:]
+
+    cursor = dbcon.cursor()
+    try:
+        cursor.execute("BEGIN TRANSACTION")
+        for path in paths:
+            if path:  # a blank line would otherwise index the cwd
+                insert(path, cursor)
+        cursor.execute("COMMIT TRANSACTION")
+    finally:
+        # Closing without COMMIT discards the open transaction.
+        dbcon.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/config.py b/config.py
new file mode 100644
--- /dev/null
+++ b/config.py
@@ -0,0 +1 @@
+DBPATH="/home/db/easyindex.sqlite"
diff --git a/create.sql b/create.sql
new file mode 100644
--- /dev/null
+++ b/create.sql
@@ -0,0 +1,15 @@
+-- Create a table. And an external content fts5 table to index it.
+CREATE TABLE file(id INTEGER PRIMARY KEY, path varchar(4096) UNIQUE, mtime integer, content text);
+CREATE VIRTUAL TABLE file_fts USING fts5(content, content='file', content_rowid='id');
+
+-- Triggers to keep the FTS index up to date.
+CREATE TRIGGER file_ai AFTER INSERT ON file BEGIN
+  INSERT INTO file_fts(rowid, content) VALUES (new.id, new.content);
+END;
+CREATE TRIGGER file_ad AFTER DELETE ON file BEGIN
+  INSERT INTO file_fts(file_fts, rowid, content) VALUES('delete', old.id, old.content);
+END;
+CREATE TRIGGER file_au AFTER UPDATE ON file BEGIN
+  INSERT INTO file_fts(file_fts, rowid, content) VALUES('delete', old.id, old.content);
+  INSERT INTO file_fts(rowid, content) VALUES (new.id, new.content);
+END;
diff --git a/delindex b/delindex
new file mode 100755
--- /dev/null
+++ b/delindex
@@ -0,0 +1,23 @@
+#!/bin/sh
+# Remove index entries whose files no longer exist on disk.
+DBFILE="/home/db/easyindex.sqlite"
+TEMPFILE=$(mktemp) || exit 1
+trap 'rm -f "$TEMPFILE"' EXIT
+
+# Append a DELETE statement for path $1. Single quotes are doubled so
+# the path stays a valid SQL string literal.
+todelete()
+{
+    escaped=$(printf '%s' "$1" | sed "s/'/''/g")
+    printf "DELETE FROM file WHERE path = '%s';\n" "$escaped" >> "$TEMPFILE"
+}
+
+echo "BEGIN TRANSACTION;" >> "$TEMPFILE"
+
+sqlite3 "$DBFILE" "SELECT path FROM file;" | while IFS= read -r line ; do
+    [ -e "$line" ] || todelete "$line"
+done
+
+echo "COMMIT TRANSACTION;" >> "$TEMPFILE"
+
+sqlite3 "$DBFILE" < "$TEMPFILE"
diff --git a/searchindex b/searchindex
new file mode 100755
--- /dev/null
+++ b/searchindex
@@ -0,0 +1,26 @@
+#!/usr/bin/python3
+"""Print the paths of indexed files matching a full-text query."""
+import sqlite3
+import sys
+import config
+
+if len(sys.argv) < 2:
+    print("Error: Missing search", file=sys.stderr)
+    sys.exit(1)
+
+# Join all arguments into one query string: the statement has a single
+# placeholder, so it must be bound to exactly one value.
+search = " ".join(sys.argv[1:])
+
+dbcon = sqlite3.connect(config.DBPATH, isolation_level=None)
+try:
+    cursor = dbcon.cursor()
+    rows = cursor.execute(
+        "SELECT file.path FROM file INNER JOIN file_fts"
+        " ON file.id = file_fts.ROWID"
+        " WHERE file_fts.content MATCH ? ORDER By file.mtime ASC",
+        (search,))
+    for row in rows:
+        print(row[0])
+finally:
+    dbcon.close()