save each page of a pdf file
This commit is contained in:
parent
58c7a52585
commit
f535372be7
46
addindex
46
addindex
@ -36,15 +36,35 @@ def striptags(content):
|
|||||||
|
|
||||||
|
|
||||||
def strip_irrelevant(content):
|
def strip_irrelevant(content):
|
||||||
result = content.replace("\n", " ").replace("\t", " ").replace("\f", "")
|
result = content.replace("\n", " ").replace("\t", " ")
|
||||||
result = re.sub(' +', ' ', result)
|
result = re.sub(' +', ' ', result)
|
||||||
return result;
|
return result;
|
||||||
|
|
||||||
|
def pdf_pagecount(path):
|
||||||
|
cmd = "pdfinfo " + path + " | grep Pages | awk '{print $2}'"
|
||||||
|
stdout,stderr = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
|
||||||
|
result = int(stdout)
|
||||||
|
return result
|
||||||
def process_pdf(path):
|
def process_pdf(path):
|
||||||
args=["pdftotext", path , "-"]
|
result = list()
|
||||||
|
args=["pdftotext", path, "-"]
|
||||||
stdout,stderr = subprocess.Popen(args,stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
|
stdout,stderr = subprocess.Popen(args,stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
|
||||||
result = strip_irrelevant(stdout.decode('utf-8'))
|
content = strip_irrelevant(stdout.decode('utf-8'))
|
||||||
return singlepagelist(result)
|
#it is faster to do it like this than to call pdftotext for each page
|
||||||
|
splitted = content.split("\f")
|
||||||
|
count=1
|
||||||
|
for page in splitted:
|
||||||
|
data = pagedata()
|
||||||
|
data.page = count
|
||||||
|
data.content = page
|
||||||
|
result.append(data)
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
#everything = pagedata()
|
||||||
|
#everything.page = 0
|
||||||
|
#everything.content = content.replace("\f", "")
|
||||||
|
#result.append(everything)
|
||||||
|
return result
|
||||||
|
|
||||||
def process_odt(path):
|
def process_odt(path):
|
||||||
fd = zipfile.ZipFile(path)
|
fd = zipfile.ZipFile(path)
|
||||||
@ -52,11 +72,7 @@ def process_odt(path):
|
|||||||
fd.close()
|
fd.close()
|
||||||
return singlepagelist(striptags(content))
|
return singlepagelist(striptags(content))
|
||||||
|
|
||||||
def process_striptags(path):
|
def readtext(path):
|
||||||
content = process_text(path)
|
|
||||||
return singlepagelist(striptags(content))
|
|
||||||
|
|
||||||
def process_text(path):
|
|
||||||
fd = open(path, "rb")
|
fd = open(path, "rb")
|
||||||
content = fd.read()
|
content = fd.read()
|
||||||
fd.close()
|
fd.close()
|
||||||
@ -74,8 +90,14 @@ def process_text(path):
|
|||||||
except:
|
except:
|
||||||
print("FAILE DECODING: " + path)
|
print("FAILE DECODING: " + path)
|
||||||
result = ""
|
result = ""
|
||||||
|
return result
|
||||||
return singlepagelist(result)
|
|
||||||
|
def process_striptags(path):
|
||||||
|
content = readtext(path)
|
||||||
|
return singlepagelist(striptags(content))
|
||||||
|
|
||||||
|
def process_text(path):
|
||||||
|
return singlepagelist(readtext(path))
|
||||||
|
|
||||||
def process_nothing(path):
|
def process_nothing(path):
|
||||||
return list()
|
return list()
|
||||||
|
@ -11,7 +11,7 @@ if len(sys.argv) < 2:
|
|||||||
|
|
||||||
search=sys.argv[1:]
|
search=sys.argv[1:]
|
||||||
#TODO: machine parseable
|
#TODO: machine parseable
|
||||||
for row in cursor.execute("SELECT file.path, contents.page FROM file INNER JOIN contents ON file.id = contents.fileid INNER JOIN content_fts ON contents.id = content_fts.ROWID WHERE content_fts.content MATCH ? ORDER By file.mtime ASC", (search)):
|
for row in cursor.execute("SELECT file.path, content.page FROM file INNER JOIN content ON file.id = content.fileid INNER JOIN content_fts ON content.id = content_fts.ROWID WHERE content_fts.content MATCH ? ORDER By file.mtime ASC", (search)):
|
||||||
print("File:", row[0], "Page: ", row[1])
|
print("File:", row[0], "Page: ", row[1])
|
||||||
dbcon.close()
|
dbcon.close()
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user