Begin groundwork to store document content per page instead of as one blob per file

2018-08-09 16:37:30 +02:00
commit 58c7a52585
--- a/62
+++ b/62
@@ -10,6 +10,20 @@ import chardet
 import config
 dbcon = sqlite3.connect(config.DBPATH, isolation_level=None)

+
+class pagedata:  # text of one page together with its page number
+	page = 0  # written to the "page" column of the content table by insert()
+	content = ""  # written to the "content" column of the content table by insert()
+	
+	
+def singlepagelist(content):
+	"""Return a one-element list holding a pagedata with page 0 and the given content."""
+	# Callers always get a list, so insert() can loop over pages uniformly.
+	single = pagedata()
+	single.page = 0
+	single.content = content
+	return [single]
+
 def striptags(content):
 	result = ""
 	try:
@@ -30,17 +44,17 @@ def process_pdf(path):
 	args=["pdftotext", path , "-"]
 	stdout,stderr =  subprocess.Popen(args,stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
 	result = strip_irrelevant(stdout.decode('utf-8'))
-	return result
+	return singlepagelist(result)
 	
 def process_odt(path):
 	fd = zipfile.ZipFile(path)
 	content = fd.read("content.xml").decode("utf-8")
 	fd.close()
-	return striptags(content)
+	return singlepagelist(striptags(content))
 	
 def process_striptags(path):
 	content = process_text(path)
-	return striptags(content)
+	return singlepagelist(striptags(content))
 	
 def process_text(path):
 	fd = open(path, "rb")
@@ -49,21 +63,22 @@ def process_text(path):

 	result=""
 	try:
-		return str(content.decode("utf-8"))
+		result = str(content.decode("utf-8"))
 	except:
-		pass
-	try:
-		encoding = chardet.detect(content)["encoding"];
-		if encoding == None:
-			return ""
-		result = str(content.decode(encoding))
-	except:
-		print("FAILE DECODING: " + path)
-		return ""
-	return result
+		try:
+			encoding = chardet.detect(content)["encoding"];
+			if encoding == None:
+				result = ""
+			else:
+				result = str(content.decode(encoding))
+		except:
+			print("FAILE DECODING: " + path)
+			result = ""
+		
+	return singlepagelist(result)
 	
 def process_nothing(path):
-	return ""
+	return list()
 	
 def exists(abspath, mtime):
 	cursor = dbcon.cursor()
@@ -89,28 +104,27 @@ def insert(path, cursor):
 	processor=process_nothing
 	if ext in preprocess:
 		processor=preprocess[ext]
-	content = processor(abspath)
+	pagedatalist = processor(abspath)

-	#if update:
-	#	cursor.execute("UPDATE file SET path = ?, mtime = ?, content =
-	cursor.execute("INSERT OR REPLACE INTO file(path, mtime, content) VALUES(?, ?, ?) ", (abspath, mtime, content))
+	cursor.execute("BEGIN TRANSACTION")
+	cursor.execute("DELETE FROM file WHERE path = ?", (abspath,))
+	cursor.execute("INSERT INTO file(path, mtime) VALUES(?, ?) ", (abspath, mtime))
+	fileid=cursor.lastrowid
+	for pagedata in pagedatalist:
+		cursor.execute("INSERT INTO content(fileid, page, content) VALUES(?, ?, ?)", (fileid, pagedata.page, pagedata.content))
+	cursor.execute("COMMIT TRANSACTION")

 preprocess={".pdf":process_pdf, ".odt":process_odt, ".html":process_striptags, ".xml":process_nothing, ".txt":process_text, 
 			".sql":process_text, ".c":process_text, ".cpp":process_text, ".js":process_text, ".java":process_text, 
 			".py":process_text, '.md':process_text}	

 cursor = dbcon.cursor()
-cursor.execute("BEGIN TRANSACTION")
-
 if len(sys.argv) < 2:
 	for line in sys.stdin:
 		insert(line.replace("\n", ""), cursor)
 else:
 	for inputfile in sys.argv[1:]:
 		insert(inputfile, cursor)
-
-cursor.execute("COMMIT TRANSACTION")
-
 dbcon.close()


--- a/create.sql
+++ b/create.sql
@@ -1,15 +1,15 @@
-- Create a table. And an external content fts5 table to index it.
-CREATE TABLE file(id INTEGER PRIMARY KEY, path varchar(4096) UNIQUE, mtime integer, content text);
-CREATE VIRTUAL TABLE file_fts USING fts5(content, content='file', content_rowid='id');
+CREATE TABLE file(id INTEGER PRIMARY KEY, path varchar(4096) UNIQUE, mtime integer);
+CREATE TABLE content(id INTEGER PRIMARY KEY, fileid INTEGER REFERENCES file (id) ON DELETE CASCADE, page integer, content text);

-- Triggers to keep the FTS index up to date.
-CREATE TRIGGER file_ai AFTER INSERT ON file BEGIN
-  INSERT INTO file_fts(rowid, content) VALUES (new.id, new.content);
+CREATE VIRTUAL TABLE content_fts USING fts5(content, content='content', content_rowid='id');
+
+CREATE TRIGGER contents_ai AFTER INSERT ON content BEGIN
+  INSERT INTO content_fts(rowid, content) VALUES (new.id, new.content);
 END;
-CREATE TRIGGER file_ad AFTER DELETE ON file BEGIN
-  INSERT INTO file_fts(file_fts, rowid, content) VALUES('delete', old.id, old.content);
+CREATE TRIGGER contents_ad AFTER DELETE ON content BEGIN
+  INSERT INTO content_fts(content_fts, rowid, content) VALUES('delete', old.id, old.content);
 END;
-CREATE TRIGGER file_au AFTER UPDATE ON file BEGIN
-  INSERT INTO file_fts(file_fts, rowid, content) VALUES('delete', old.id, old.content);
-  INSERT INTO file_fts(rowid, content) VALUES (new.id, new.content);
+CREATE TRIGGER contents_au AFTER UPDATE ON content BEGIN
+  INSERT INTO content_fts(content_fts, rowid, content) VALUES('delete', old.id, old.content);
+  INSERT INTO content_fts(rowid, content) VALUES (new.id, new.content);
 END;
--- a/6
+++ b/6
@@ -10,9 +10,9 @@ if len(sys.argv) < 2:
 	print("Error: Missing search")
 	
 search=sys.argv[1:]
-
-for row in cursor.execute("SELECT file.path FROM file INNER JOIN file_fts ON file.id = file_fts.ROWID WHERE file_fts.content MATCH ? ORDER By file.mtime ASC", (search)):
-	print(row[0])
+# Print path and page of every match, ordered by file mtime ascending. TODO: make the output machine parseable.
+for row in cursor.execute("SELECT file.path, content.page FROM file INNER JOIN content ON file.id = content.fileid INNER JOIN content_fts ON content.id = content_fts.ROWID WHERE content_fts.content MATCH ? ORDER By file.mtime ASC", (" ".join(search),)):
+	print("File:", row[0], "Page: ", row[1])
 dbcon.close()