diff --git a/addindex b/addindex index 8d0b917..3fb297a 100755 --- a/addindex +++ b/addindex @@ -36,15 +36,35 @@ def striptags(content): def strip_irrelevant(content): - result = content.replace("\n", " ").replace("\t", " ").replace("\f", "") + result = content.replace("\n", " ").replace("\t", " ") result = re.sub(' +', ' ', result) return result; - + +def pdf_pagecount(path): + cmd = "pdfinfo " + path + " | grep Pages | awk '{print $2}'" + stdout,stderr = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() + result = int(stdout) + return result def process_pdf(path): - args=["pdftotext", path , "-"] + result = list() + args=["pdftotext", path, "-"] stdout,stderr = subprocess.Popen(args,stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() - result = strip_irrelevant(stdout.decode('utf-8')) - return singlepagelist(result) + content = strip_irrelevant(stdout.decode('utf-8')) + #it is faster to do it like this than to call pdfottext for each page + splitted = content.split("\f") + count=1 + for page in splitted: + data = pagedata() + data.page = count + data.content = page + result.append(data) + count += 1 + + #everything = pagedata() + #everything.page = 0 + #everything.content = content.replace("\f", "") + #result.append(everything) + return result def process_odt(path): fd = zipfile.ZipFile(path) @@ -52,11 +72,7 @@ def process_odt(path): fd.close() return singlepagelist(striptags(content)) -def process_striptags(path): - content = process_text(path) - return singlepagelist(striptags(content)) - -def process_text(path): +def readtext(path): fd = open(path, "rb") content = fd.read() fd.close() @@ -74,8 +90,14 @@ def process_text(path): except: print("FAILE DECODING: " + path) result = "" - - return singlepagelist(result) + return result + +def process_striptags(path): + content = readtext(path) + return singlepagelist(striptags(content)) + +def process_text(path): + return singlepagelist(readtext(path)) def process_nothing(path): return list() diff --git a/searchindex b/searchindex index 258a243..35e4da8 100755 --- a/searchindex +++ b/searchindex @@ -11,7 +11,7 @@ if len(sys.argv) < 2: search=sys.argv[1:] #TODO: machien parseable -for row in cursor.execute("SELECT file.path, contents.page FROM file INNER JOIN contents ON file.id = contents.fileid INNER JOIN content_fts ON contents.id = content_fts.ROWID WHERE content_fts.content MATCH ? ORDER By file.mtime ASC", (search)): +for row in cursor.execute("SELECT file.path, content.page FROM file INNER JOIN content ON file.id = content.fileid INNER JOIN content_fts ON content.id = content_fts.ROWID WHERE content_fts.content MATCH ? ORDER By file.mtime ASC", (search)): print("File:", row[0], "Page: ", row[1]) dbcon.close()