save each page of a pdf file

2018-08-09 19:30:28 +02:00
parent 58c7a52585
commit f535372be7
2 changed files with 35 additions and 13 deletions
--- a/46
+++ b/46
@@ -36,15 +36,35 @@ def striptags(content):
 	

 def strip_irrelevant(content):
-	result = content.replace("\n", " ").replace("\t", " ").replace("\f", "")
+	result = content.replace("\n", " ").replace("\t", " ")
 	result = re.sub(' +', ' ', result)
 	return result;
-	
+
+def pdf_pagecount(path):
+	cmd = "pdfinfo " + path + " | grep Pages | awk '{print $2}'"
+	stdout,stderr = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
+	result = int(stdout)
+	return result
 def process_pdf(path):
-	args=["pdftotext", path , "-"]
+	result = list()
+	args=["pdftotext", path, "-"]
 	stdout,stderr =  subprocess.Popen(args,stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
-	result = strip_irrelevant(stdout.decode('utf-8'))
-	return singlepagelist(result)
+	content = strip_irrelevant(stdout.decode('utf-8'))
+	#it is faster to do it like this than to call pdftotext for each page
+	splitted = content.split("\f")
+	count=1
+	for page in splitted: 
+		data = pagedata()
+		data.page = count
+		data.content = page
+		result.append(data)
+		count += 1
+	
+	#everything = pagedata()
+	#everything.page = 0
+	#everything.content = content.replace("\f", "")
+	#result.append(everything)
+	return result
 	
 def process_odt(path):
 	fd = zipfile.ZipFile(path)
@@ -52,11 +72,7 @@ def process_odt(path):
 	fd.close()
 	return singlepagelist(striptags(content))
 	
-def process_striptags(path):
-	content = process_text(path)
-	return singlepagelist(striptags(content))
-	
-def process_text(path):
+def readtext(path):
 	fd = open(path, "rb")
 	content = fd.read()
 	fd.close()
@@ -74,8 +90,14 @@ def process_text(path):
 		except:
 			print("FAILE DECODING: " + path)
 			result = ""
-		
-	return singlepagelist(result)
+	return result
+	
+def process_striptags(path):
+	content = readtext(path)
+	return singlepagelist(striptags(content))
+	
+def process_text(path):
+	return singlepagelist(readtext(path))
 	
 def process_nothing(path):
 	return list()
--- a/2
+++ b/2
@@ -11,7 +11,7 @@ if len(sys.argv) < 2:
 	
 search=sys.argv[1:]
 #TODO: machien parseable
-for row in cursor.execute("SELECT file.path, contents.page FROM file INNER JOIN contents ON file.id = contents.fileid INNER JOIN content_fts ON contents.id = content_fts.ROWID WHERE content_fts.content MATCH ? ORDER By file.mtime ASC", (search)):
+for row in cursor.execute("SELECT file.path, content.page FROM file INNER JOIN content ON file.id = content.fileid INNER JOIN content_fts ON content.id = content_fts.ROWID WHERE content_fts.content MATCH ? ORDER By file.mtime ASC", (search)):
 	print("File:", row[0], "Page: ", row[1])
 dbcon.close()