Begin groundwork to store document content per page instead of only as one blob per file

This commit is contained in:
Albert S. 2018-08-09 16:37:30 +02:00
parent cfc8659692
commit 58c7a52585
3 changed files with 52 additions and 38 deletions

View File

@ -10,6 +10,20 @@ import chardet
import config import config
dbcon = sqlite3.connect(config.DBPATH, isolation_level=None) dbcon = sqlite3.connect(config.DBPATH, isolation_level=None)
class pagedata:
    """Text content of a single page of a document.

    The class-level attributes are kept as defaults so that code which
    creates a bare ``pagedata()`` and assigns the attributes afterwards
    keeps working unchanged.
    """

    # Page number of this entry; defaults to 0.
    page = 0
    # Text content of this entry; defaults to the empty string.
    content = ""

    def __init__(self, page=0, content=""):
        """Create an entry, optionally setting *page* and *content*."""
        self.page = page
        self.content = content

    def __repr__(self):
        return "pagedata(page=%r, content=%r)" % (self.page, self.content)
def singlepagelist(content):
    """Wrap *content* in a list holding exactly one ``pagedata`` entry.

    The entry carries *content* as its text and page number 0.

    Returns:
        A new one-element list containing that ``pagedata`` instance.
    """
    result = pagedata()
    result.content = content
    result.page = 0
    return [result]
def striptags(content): def striptags(content):
result = "" result = ""
try: try:
@ -30,17 +44,17 @@ def process_pdf(path):
args=["pdftotext", path , "-"] args=["pdftotext", path , "-"]
stdout,stderr = subprocess.Popen(args,stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() stdout,stderr = subprocess.Popen(args,stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
result = strip_irrelevant(stdout.decode('utf-8')) result = strip_irrelevant(stdout.decode('utf-8'))
return result return singlepagelist(result)
def process_odt(path): def process_odt(path):
fd = zipfile.ZipFile(path) fd = zipfile.ZipFile(path)
content = fd.read("content.xml").decode("utf-8") content = fd.read("content.xml").decode("utf-8")
fd.close() fd.close()
return striptags(content) return singlepagelist(striptags(content))
def process_striptags(path): def process_striptags(path):
content = process_text(path) content = process_text(path)
return striptags(content) return singlepagelist(striptags(content))
def process_text(path): def process_text(path):
fd = open(path, "rb") fd = open(path, "rb")
@ -49,21 +63,22 @@ def process_text(path):
result="" result=""
try: try:
return str(content.decode("utf-8")) result = str(content.decode("utf-8"))
except: except:
pass
try: try:
encoding = chardet.detect(content)["encoding"]; encoding = chardet.detect(content)["encoding"];
if encoding == None: if encoding == None:
return "" result = ""
else:
result = str(content.decode(encoding)) result = str(content.decode(encoding))
except: except:
print("FAILE DECODING: " + path) print("FAILE DECODING: " + path)
return "" result = ""
return result
return singlepagelist(result)
def process_nothing(path): def process_nothing(path):
return "" return list()
def exists(abspath, mtime): def exists(abspath, mtime):
cursor = dbcon.cursor() cursor = dbcon.cursor()
@ -89,28 +104,27 @@ def insert(path, cursor):
processor=process_nothing processor=process_nothing
if ext in preprocess: if ext in preprocess:
processor=preprocess[ext] processor=preprocess[ext]
content = processor(abspath) pagedatalist = processor(abspath)
#if update: cursor.execute("BEGIN TRANSACTION")
# cursor.execute("UPDATE file SET path = ?, mtime = ?, content = cursor.execute("DELETE FROM file WHERE path = ?", (abspath,))
cursor.execute("INSERT OR REPLACE INTO file(path, mtime, content) VALUES(?, ?, ?) ", (abspath, mtime, content)) cursor.execute("INSERT INTO file(path, mtime) VALUES(?, ?) ", (abspath, mtime))
fileid=cursor.lastrowid
for pagedata in pagedatalist:
cursor.execute("INSERT INTO content(fileid, page, content) VALUES(?, ?, ?)", (fileid, pagedata.page, pagedata.content))
cursor.execute("COMMIT TRANSACTION")
preprocess={".pdf":process_pdf, ".odt":process_odt, ".html":process_striptags, ".xml":process_nothing, ".txt":process_text, preprocess={".pdf":process_pdf, ".odt":process_odt, ".html":process_striptags, ".xml":process_nothing, ".txt":process_text,
".sql":process_text, ".c":process_text, ".cpp":process_text, ".js":process_text, ".java":process_text, ".sql":process_text, ".c":process_text, ".cpp":process_text, ".js":process_text, ".java":process_text,
".py":process_text, '.md':process_text} ".py":process_text, '.md':process_text}
cursor = dbcon.cursor() cursor = dbcon.cursor()
cursor.execute("BEGIN TRANSACTION")
if len(sys.argv) < 2: if len(sys.argv) < 2:
for line in sys.stdin: for line in sys.stdin:
insert(line.replace("\n", ""), cursor) insert(line.replace("\n", ""), cursor)
else: else:
for inputfile in sys.argv[1:]: for inputfile in sys.argv[1:]:
insert(inputfile, cursor) insert(inputfile, cursor)
cursor.execute("COMMIT TRANSACTION")
dbcon.close() dbcon.close()

View File

@ -1,15 +1,15 @@
-- Create a table. And an external content fts5 table to index it. CREATE TABLE file(id INTEGER PRIMARY KEY, path varchar(4096) UNIQUE, mtime integer);
CREATE TABLE file(id INTEGER PRIMARY KEY, path varchar(4096) UNIQUE, mtime integer, content text); CREATE TABLE content(id INTEGER PRIMARY KEY, fileid INTEGER REFERENCES file (id) ON DELETE CASCADE, page integer, content text);
CREATE VIRTUAL TABLE file_fts USING fts5(content, content='file', content_rowid='id');
-- Triggers to keep the FTS index up to date. CREATE VIRTUAL TABLE content_fts USING fts5(content, content='content', content_rowid='id');
CREATE TRIGGER file_ai AFTER INSERT ON file BEGIN
INSERT INTO file_fts(rowid, content) VALUES (new.id, new.content); CREATE TRIGGER contents_ai AFTER INSERT ON content BEGIN
INSERT INTO content_fts(rowid, content) VALUES (new.id, new.content);
END; END;
CREATE TRIGGER file_ad AFTER DELETE ON file BEGIN CREATE TRIGGER contents_ad AFTER DELETE ON content BEGIN
INSERT INTO file_fts(file_fts, rowid, content) VALUES('delete', old.id, old.content); INSERT INTO content_fts(content_fts, rowid, content) VALUES('delete', old.id, old.content);
END; END;
CREATE TRIGGER file_au AFTER UPDATE ON file BEGIN CREATE TRIGGER contents_au AFTER UPDATE ON content BEGIN
INSERT INTO file_fts(file_fts, rowid, content) VALUES('delete', old.id, old.content); INSERT INTO content_fts(content_fts, rowid, content) VALUES('delete', old.id, old.content);
INSERT INTO file_fts(rowid, content) VALUES (new.id, new.content); INSERT INTO content_fts(rowid, content) VALUES (new.id, new.content);
END; END;

View File

@ -10,9 +10,9 @@ if len(sys.argv) < 2:
print("Error: Missing search") print("Error: Missing search")
search=sys.argv[1:] search=sys.argv[1:]
#TODO: make output machine-parseable
for row in cursor.execute("SELECT file.path FROM file INNER JOIN file_fts ON file.id = file_fts.ROWID WHERE file_fts.content MATCH ? ORDER By file.mtime ASC", (search)): for row in cursor.execute("SELECT file.path, contents.page FROM file INNER JOIN contents ON file.id = contents.fileid INNER JOIN content_fts ON contents.id = content_fts.ROWID WHERE content_fts.content MATCH ? ORDER By file.mtime ASC", (search)):
print(row[0]) print("File:", row[0], "Page: ", row[1])
dbcon.close() dbcon.close()