2018-01-03 09:40:13 +01:00
|
|
|
#!/usr/bin/python3
|
|
|
|
import sqlite3
|
|
|
|
import os.path
|
|
|
|
import sys
|
|
|
|
import subprocess
|
|
|
|
import zipfile
|
|
|
|
import xml.etree.ElementTree
|
|
|
|
import re
|
|
|
|
import chardet
|
|
|
|
import config
|
2018-08-09 22:47:33 +02:00
|
|
|
from multiprocessing import Pool
|
|
|
|
|
2018-01-03 09:40:13 +01:00
|
|
|
|
2018-08-09 16:37:30 +02:00
|
|
|
|
|
|
|
class pagedata:
    """A chunk of extracted text together with the page number it belongs to.

    Attributes:
        page: Page number of the text. Within this file, 0 is used for text
            that is not tied to one specific page.
        content: The extracted text.
    """

    # Class-level defaults are kept so that code reading ``pagedata.page`` or
    # ``pagedata.content`` directly on the class keeps working.
    page = 0
    content = ""

    def __init__(self, page=0, content=""):
        """Create an entry; both arguments are optional for compatibility."""
        self.page = page
        self.content = content

    def __repr__(self):
        return "pagedata(page={!r}, content={!r})".format(self.page, self.content)
|
|
|
|
|
|
|
|
|
|
|
|
def singlepagelist(content):
    """Wrap *content* in a one-element list of ``pagedata``.

    The single entry gets page number 0 and *content* as its text.
    """
    entry = pagedata()
    entry.page = 0
    entry.content = content
    return [entry]
|
|
|
|
|
2018-01-03 09:40:13 +01:00
|
|
|
def striptags(content):
    """Return *content* with markup tags removed.

    The input is first parsed as XML and the text of every node is
    concatenated. If parsing fails, everything between ``<`` and ``>`` is
    stripped with a regular expression instead.

    Args:
        content: The markup as a string.

    Returns:
        The text without tags.
    """
    try:
        return ''.join(xml.etree.ElementTree.fromstring(content).itertext())
    except Exception:
        # Deliberate best-effort fallback for input that is not well-formed
        # XML. Only ``Exception`` is caught so that KeyboardInterrupt and
        # SystemExit are no longer swallowed here.
        #TODO: test<br>test2 will make it testtest2 not test test2
        return re.sub(r'<[^>]*>', '', content)
|
|
|
|
|
|
|
|
|
|
|
|
def strip_irrelevant(content):
    """Turn newlines and tabs into spaces and collapse runs of spaces.

    Other characters (including form feeds) are left untouched.
    """
    flattened = content
    for whitespace in ("\n", "\t"):
        flattened = flattened.replace(whitespace, " ")
    return re.sub(' +', ' ', flattened)
|
2018-08-09 19:30:28 +02:00
|
|
|
|
|
|
|
def pdf_pagecount(path):
    """Return the page count that ``pdfinfo`` reports for the PDF at *path*.

    Args:
        path: Path of the PDF file.

    Returns:
        The number of pages as an int.

    Raises:
        ValueError: If ``pdfinfo`` could not be run or its output contains no
            parsable ``Pages:`` line.
    """
    # An argument list without a shell hands the path to pdfinfo verbatim, so
    # spaces or shell metacharacters in file names can neither break the call
    # nor inject commands.
    try:
        completed = subprocess.run(["pdfinfo", path],
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
    except OSError as err:
        # Keep the failure type callers already see when no count is available.
        raise ValueError("could not run pdfinfo for " + path) from err

    for line in completed.stdout.decode("utf-8", errors="replace").splitlines():
        if line.startswith("Pages:"):
            return int(line.split(":", 1)[1])
    raise ValueError("pdfinfo reported no page count for " + path)
|
2018-01-03 09:40:13 +01:00
|
|
|
def process_pdf(path):
    """Extract the text of the PDF at *path* using ``pdftotext``.

    Args:
        path: Path of the PDF file.

    Returns:
        A list of ``pagedata``: one entry per page, numbered from 1, followed
        by one entry with page 0 that holds the text of the whole document.
    """
    args = ["pdftotext", path, "-"]
    stdout, stderr = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    # Undecodable bytes are replaced instead of raising, so a single odd PDF
    # does not abort the run.
    content = strip_irrelevant(stdout.decode('utf-8', errors='replace'))

    # It is faster to split on form feeds than to call pdftotext for each page.
    pages = content.split("\f")
    # When the output ends with a form feed, the split leaves a blank trailing
    # segment that is not a real page; drop it so no phantom page is stored.
    if len(pages) > 1 and not pages[-1].strip():
        pages.pop()

    result = []
    for number, text in enumerate(pages, start=1):
        data = pagedata()
        data.page = number
        data.content = text
        result.append(data)

    #TODO: current hack, so we can fts search several words over the whole document
    #this of course uses more space, but in the end that's not a big problem
    #Nevertheless, this remains a hack
    everything = pagedata()
    everything.page = 0
    everything.content = content.replace("\f", "")
    result.append(everything)
    return result
|
2018-01-03 09:40:13 +01:00
|
|
|
|
|
|
|
def process_odt(path):
    """Extract the text of the ODT document at *path*.

    Reads ``content.xml`` from the zip container, decodes it as UTF-8 and
    strips the markup.

    Args:
        path: Path of the ``.odt`` file.

    Returns:
        A one-element list as produced by ``singlepagelist``.
    """
    # The context manager closes the archive even if reading or decoding fails.
    with zipfile.ZipFile(path) as archive:
        content = archive.read("content.xml").decode("utf-8")
    return singlepagelist(striptags(content))
|
2018-01-03 09:40:13 +01:00
|
|
|
|
2018-08-09 19:30:28 +02:00
|
|
|
def readtext(path):
    """Read the file at *path* and return its content as text.

    The bytes are decoded as UTF-8 first. If that fails, the encoding is
    guessed with ``chardet`` and used instead.

    Args:
        path: Path of the file to read.

    Returns:
        The decoded text, or an empty string when no encoding could be
        detected or decoding with the detected encoding failed.
    """
    # The context manager closes the file even if reading fails.
    with open(path, "rb") as fd:
        content = fd.read()

    try:
        return content.decode("utf-8")
    except UnicodeDecodeError:
        pass  # not UTF-8; fall through to encoding detection

    try:
        encoding = chardet.detect(content)["encoding"]
        if encoding is None:
            return ""
        return content.decode(encoding)
    except Exception:
        # Deliberately best-effort: an undecodable file yields no text rather
        # than aborting the run. KeyboardInterrupt/SystemExit still propagate.
        print("FAILE DECODING: " + path)
        return ""
|
|
|
|
|
|
|
|
def process_striptags(path):
    """Read the file at *path*, strip its markup and return it as a single page list."""
    return singlepagelist(striptags(readtext(path)))
|
|
|
|
|
|
|
|
def process_text(path):
    """Read the file at *path* as text and return it as a single page list."""
    text = readtext(path)
    return singlepagelist(text)
|
2018-01-03 09:40:13 +01:00
|
|
|
|
|
|
|
def process_nothing(path):
    """Return an empty page list; *path* is ignored."""
    return []
|
2018-01-03 09:40:13 +01:00
|
|
|
|
|
|
|
def exists(abspath, mtime):
    """Tell whether the ``file`` table has a row for *abspath* with this *mtime*.

    Returns True when such a row is found, False otherwise.
    """
    lookup = dbcon.cursor()
    lookup.execute("SELECT 1 FROM file WHERE path = ? AND mtime = ?" , (abspath, mtime))
    row = lookup.fetchone()
    if row is None:
        return False
    return row[0] == 1
|
|
|
|
|
2018-08-09 22:51:21 +02:00
|
|
|
def insert(path):
    """Index the file at *path* into the database.

    The file is skipped when a row with the same absolute path and
    modification time already exists. Otherwise its content is extracted by
    the processor registered for its extension in ``preprocess`` (files with
    an unregistered extension get a ``file`` row but no content), and the
    ``file`` row plus one ``content`` row per extracted page are written in a
    single transaction.

    Args:
        path: Path of the file to index; it is made absolute before use.

    Raises:
        Any exception raised while writing is re-raised after the transaction
        has been rolled back.
    """
    print("processing", path)
    abspath = os.path.abspath(path)
    mtime = int(os.stat(abspath).st_mtime)

    if exists(abspath, mtime):
        print("Leaving alone " + abspath + " because it wasn't changed")
        return

    ext = os.path.splitext(abspath)[1]
    processor = preprocess.get(ext, process_nothing)
    pages = processor(abspath)

    #TODO: assumes sqlite has been built with thread safety (and it is the default)
    cursor = dbcon.cursor()
    cursor.execute("BEGIN TRANSACTION")
    try:
        cursor.execute("DELETE FROM file WHERE path = ?", (abspath,))
        cursor.execute("INSERT INTO file(path, mtime) VALUES(?, ?) ", (abspath, mtime))
        fileid = cursor.lastrowid
        for entry in pages:
            cursor.execute("INSERT INTO content(fileid, page, content) VALUES(?, ?, ?)", (fileid, entry.page, entry.content))
        cursor.execute("COMMIT TRANSACTION")
    except BaseException:
        # Without a rollback the connection would stay inside the failed
        # transaction and the next "BEGIN TRANSACTION" on it would fail.
        dbcon.rollback()
        raise
|
2018-01-03 09:40:13 +01:00
|
|
|
|
|
|
|
# Maps a file extension (including the leading dot) to the function that
# extracts its content.
preprocess = {
    ".pdf": process_pdf,
    ".odt": process_odt,
    ".html": process_striptags,
    ".xml": process_nothing,
    ".txt": process_text,
    ".sql": process_text,
    ".c": process_text,
    ".cpp": process_text,
    ".js": process_text,
    ".java": process_text,
    ".py": process_text,
    '.md': process_text,
}
|
|
|
|
|
2018-08-09 22:47:33 +02:00
|
|
|
|
|
|
|
|
|
|
|
def yieldstdinfiles():
    """Yield each line read from standard input with its newline characters removed."""
    yield from (raw.replace("\n", "") for raw in sys.stdin)
|
2018-08-09 22:51:21 +02:00
|
|
|
|
2018-08-09 22:47:33 +02:00
|
|
|
|
|
|
|
def init():
    """Open the SQLite database at ``config.DBPATH`` and store it in the global ``dbcon``.

    The connection is opened with ``isolation_level=None``.
    """
    global dbcon
    connection = sqlite3.connect(config.DBPATH, isolation_level=None)
    dbcon = connection
|
|
|
|
|
|
|
|
|
|
|
|
# Database connection used by exists() and insert(); assigned by init().
dbcon = None

if __name__ == '__main__':
    # Every worker process runs init() once to set up its own connection.
    with Pool(processes=4, initializer=init) as pool:
        if len(sys.argv) < 2:
            # No paths on the command line: read them from standard input.
            # The generator must be called; pool.map needs an iterable, not
            # the generator function itself.
            pool.map(insert, yieldstdinfiles())
        else:
            pool.map(insert, sys.argv[1:])
|
2018-08-09 22:47:33 +02:00
|
|
|
|
|
|
|
|
2018-01-03 09:40:13 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|