save each page of a pdf file

This commit is contained in:
Albert S. 2018-08-09 19:30:28 +02:00
parent 58c7a52585
commit f535372be7
2 changed files with 35 additions and 13 deletions

View File

@ -36,15 +36,35 @@ def striptags(content):
def strip_irrelevant(content): def strip_irrelevant(content):
result = content.replace("\n", " ").replace("\t", " ").replace("\f", "") result = content.replace("\n", " ").replace("\t", " ")
result = re.sub(' +', ' ', result) result = re.sub(' +', ' ', result)
return result; return result;
def pdf_pagecount(path):
    """Return the number of pages that ``pdfinfo`` reports for a PDF file.

    Args:
        path: Path of the PDF file; passed to ``pdfinfo`` as one argument.

    Returns:
        The page count as an ``int``.

    Raises:
        ValueError: If ``pdfinfo`` cannot be run, prints no ``Pages:`` line,
            or prints a value that is not an integer.
    """
    # Argument list and no shell: the path reaches pdfinfo verbatim, so
    # spaces, quotes or shell metacharacters in a file name can neither
    # break the command nor inject another one.
    try:
        proc = subprocess.Popen(["pdfinfo", path],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout, _ = proc.communicate()
    except OSError as err:
        # Keep the failure type callers already see when no count is found.
        raise ValueError("could not run pdfinfo for " + repr(path)) from err

    # Match the "Pages" key exactly; a substring search would also hit
    # other lines such as a title that merely contains the word.
    for line in stdout.decode("utf-8", errors="replace").splitlines():
        key, sep, value = line.partition(":")
        if sep and key.strip() == "Pages":
            return int(value.strip())
    raise ValueError("pdfinfo reported no page count for " + repr(path))
def process_pdf(path): def process_pdf(path):
args=["pdftotext", path , "-"] result = list()
args=["pdftotext", path, "-"]
stdout,stderr = subprocess.Popen(args,stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() stdout,stderr = subprocess.Popen(args,stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
result = strip_irrelevant(stdout.decode('utf-8')) content = strip_irrelevant(stdout.decode('utf-8'))
return singlepagelist(result) #it is faster to do it like this than to call pdfottext for each page
splitted = content.split("\f")
count=1
for page in splitted:
data = pagedata()
data.page = count
data.content = page
result.append(data)
count += 1
#everything = pagedata()
#everything.page = 0
#everything.content = content.replace("\f", "")
#result.append(everything)
return result
def process_odt(path): def process_odt(path):
fd = zipfile.ZipFile(path) fd = zipfile.ZipFile(path)
@ -52,11 +72,7 @@ def process_odt(path):
fd.close() fd.close()
return singlepagelist(striptags(content)) return singlepagelist(striptags(content))
def process_striptags(path): def readtext(path):
content = process_text(path)
return singlepagelist(striptags(content))
def process_text(path):
fd = open(path, "rb") fd = open(path, "rb")
content = fd.read() content = fd.read()
fd.close() fd.close()
@ -74,8 +90,14 @@ def process_text(path):
except: except:
print("FAILE DECODING: " + path) print("FAILE DECODING: " + path)
result = "" result = ""
return result
return singlepagelist(result) def process_striptags(path):
content = readtext(path)
return singlepagelist(striptags(content))
def process_text(path):
return singlepagelist(readtext(path))
def process_nothing(path): def process_nothing(path):
return list() return list()

View File

@ -11,7 +11,7 @@ if len(sys.argv) < 2:
search=sys.argv[1:] search=sys.argv[1:]
#TODO: machine parseable #TODO: machine parseable
for row in cursor.execute("SELECT file.path, contents.page FROM file INNER JOIN contents ON file.id = contents.fileid INNER JOIN content_fts ON contents.id = content_fts.ROWID WHERE content_fts.content MATCH ? ORDER By file.mtime ASC", (search)): for row in cursor.execute("SELECT file.path, content.page FROM file INNER JOIN content ON file.id = content.fileid INNER JOIN content_fts ON content.id = content_fts.ROWID WHERE content_fts.content MATCH ? ORDER By file.mtime ASC", (search)):
print("File:", row[0], "Page: ", row[1]) print("File:", row[0], "Page: ", row[1])
dbcon.close() dbcon.close()