diff --git a/addindex b/addindex index 5d8df5a..8d0b917 100755 --- a/addindex +++ b/addindex @@ -10,6 +10,20 @@ import chardet import config dbcon = sqlite3.connect(config.DBPATH, isolation_level=None) + +class pagedata: + page = 0 + content = "" + + +def singlepagelist(content): + result = pagedata() + result.content = content + result.page = 0 + l = list(); + l.append(result) + return l + def striptags(content): result = "" try: @@ -30,17 +44,17 @@ def process_pdf(path): args=["pdftotext", path , "-"] stdout,stderr = subprocess.Popen(args,stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() result = strip_irrelevant(stdout.decode('utf-8')) - return result + return singlepagelist(result) def process_odt(path): fd = zipfile.ZipFile(path) content = fd.read("content.xml").decode("utf-8") fd.close() - return striptags(content) + return singlepagelist(striptags(content)) def process_striptags(path): content = process_text(path) - return striptags(content) + return singlepagelist(striptags(content)) def process_text(path): fd = open(path, "rb") @@ -49,21 +63,22 @@ def process_text(path): result="" try: - return str(content.decode("utf-8")) + result = str(content.decode("utf-8")) except: - pass - try: - encoding = chardet.detect(content)["encoding"]; - if encoding == None: - return "" - result = str(content.decode(encoding)) - except: - print("FAILE DECODING: " + path) - return "" - return result + try: + encoding = chardet.detect(content)["encoding"]; + if encoding == None: + result = "" + else: + result = str(content.decode(encoding)) + except: + print("FAILE DECODING: " + path) + result = "" + + return singlepagelist(result) def process_nothing(path): - return "" + return list() def exists(abspath, mtime): cursor = dbcon.cursor() @@ -89,28 +104,27 @@ def insert(path, cursor): processor=process_nothing if ext in preprocess: processor=preprocess[ext] - content = processor(abspath) + pagedatalist = processor(abspath) - #if update: - # cursor.execute("UPDATE file 
SET path = ?, mtime = ?, content = - cursor.execute("INSERT OR REPLACE INTO file(path, mtime, content) VALUES(?, ?, ?) ", (abspath, mtime, content)) + cursor.execute("BEGIN TRANSACTION") + cursor.execute("DELETE FROM file WHERE path = ?", (abspath,)) + cursor.execute("INSERT INTO file(path, mtime) VALUES(?, ?) ", (abspath, mtime)) + fileid=cursor.lastrowid + for pagedata in pagedatalist: + cursor.execute("INSERT INTO content(fileid, page, content) VALUES(?, ?, ?)", (fileid, pagedata.page, pagedata.content)) + cursor.execute("COMMIT TRANSACTION") preprocess={".pdf":process_pdf, ".odt":process_odt, ".html":process_striptags, ".xml":process_nothing, ".txt":process_text, ".sql":process_text, ".c":process_text, ".cpp":process_text, ".js":process_text, ".java":process_text, ".py":process_text, '.md':process_text} cursor = dbcon.cursor() -cursor.execute("BEGIN TRANSACTION") - if len(sys.argv) < 2: for line in sys.stdin: insert(line.replace("\n", ""), cursor) else: for inputfile in sys.argv[1:]: insert(inputfile, cursor) - -cursor.execute("COMMIT TRANSACTION") - dbcon.close() diff --git a/create.sql b/create.sql index a560ed5..a55b472 100644 --- a/create.sql +++ b/create.sql @@ -1,15 +1,15 @@ --- Create a table. And an external content fts5 table to index it. -CREATE TABLE file(id INTEGER PRIMARY KEY, path varchar(4096) UNIQUE, mtime integer, content text); -CREATE VIRTUAL TABLE file_fts USING fts5(content, content='file', content_rowid='id'); +CREATE TABLE file(id INTEGER PRIMARY KEY, path varchar(4096) UNIQUE, mtime integer); +CREATE TABLE content(id INTEGER PRIMARY KEY, fileid INTEGER REFERENCES file (id) ON DELETE CASCADE, page integer, content text); --- Triggers to keep the FTS index up to date. 
-CREATE TRIGGER file_ai AFTER INSERT ON file BEGIN - INSERT INTO file_fts(rowid, content) VALUES (new.id, new.content); +CREATE VIRTUAL TABLE content_fts USING fts5(content, content='content', content_rowid='id'); + +CREATE TRIGGER contents_ai AFTER INSERT ON content BEGIN + INSERT INTO content_fts(rowid, content) VALUES (new.id, new.content); END; -CREATE TRIGGER file_ad AFTER DELETE ON file BEGIN - INSERT INTO file_fts(file_fts, rowid, content) VALUES('delete', old.id, old.content); +CREATE TRIGGER contents_ad AFTER DELETE ON content BEGIN + INSERT INTO content_fts(content_fts, rowid, content) VALUES('delete', old.id, old.content); END; -CREATE TRIGGER file_au AFTER UPDATE ON file BEGIN - INSERT INTO file_fts(file_fts, rowid, content) VALUES('delete', old.id, old.content); - INSERT INTO file_fts(rowid, content) VALUES (new.id, new.content); +CREATE TRIGGER contents_au AFTER UPDATE ON content BEGIN + INSERT INTO content_fts(content_fts, rowid, content) VALUES('delete', old.id, old.content); + INSERT INTO content_fts(rowid, content) VALUES (new.id, new.content); END; diff --git a/searchindex b/searchindex index 930a0a9..258a243 100755 --- a/searchindex +++ b/searchindex @@ -10,9 +10,9 @@ if len(sys.argv) < 2: print("Error: Missing search") search=sys.argv[1:] - -for row in cursor.execute("SELECT file.path FROM file INNER JOIN file_fts ON file.id = file_fts.ROWID WHERE file_fts.content MATCH ? ORDER By file.mtime ASC", (search)): - print(row[0]) +#TODO: machine parseable +for row in cursor.execute("SELECT file.path, content.page FROM file INNER JOIN content ON file.id = content.fileid INNER JOIN content_fts ON content.id = content_fts.ROWID WHERE content_fts.content MATCH ? ORDER By file.mtime ASC", (search)): + print("File:", row[0], "Page: ", row[1]) dbcon.close()