From 9e02827d1591f951b61f9832eac6739b19ad47c8 Mon Sep 17 00:00:00 2001 From: mwiegand Date: Wed, 16 Jun 2021 19:43:27 +0200 Subject: [PATCH] wip --- bundles/dovecot/files/decode2text.sh | 105 +++++++++++++++++++++++++++ bundles/dovecot/files/dovecot.conf | 22 ++++++ bundles/dovecot/items.py | 8 ++ bundles/dovecot/metadata.py | 10 ++- 4 files changed, 142 insertions(+), 3 deletions(-) create mode 100644 bundles/dovecot/files/decode2text.sh diff --git a/bundles/dovecot/files/decode2text.sh b/bundles/dovecot/files/decode2text.sh new file mode 100644 index 0000000..151fb7c --- /dev/null +++ b/bundles/dovecot/files/decode2text.sh @@ -0,0 +1,105 @@ +#!/bin/sh + +# Example attachment decoder script. The attachment comes from stdin, and +# the script is expected to output UTF-8 data to stdout. (If the output isn't +# UTF-8, everything except valid UTF-8 sequences are dropped from it.) + +# The attachment decoding is enabled by setting: +# +# plugin { +# fts_decoder = decode2text +# } +# service decode2text { +# executable = script /usr/local/libexec/dovecot/decode2text.sh +# user = dovecot +# unix_listener decode2text { +# mode = 0666 +# } +# } + +libexec_dir=`dirname $0` +content_type=$1 + +# The second parameter is the format's filename extension, which is used when +# found from a filename of application/octet-stream. You can also add more +# extensions by giving more parameters. +formats='application/pdf pdf +application/x-pdf pdf +application/msword doc +application/mspowerpoint ppt +application/vnd.ms-powerpoint ppt +application/ms-excel xls +application/x-msexcel xls +application/vnd.ms-excel xls +application/vnd.openxmlformats-officedocument.wordprocessingml.document docx +application/vnd.openxmlformats-officedocument.spreadsheetml.sheet xlsx +application/vnd.openxmlformats-officedocument.presentationml.presentation pptx +application/vnd.oasis.opendocument.text odt +application/vnd.oasis.opendocument.spreadsheet ods +application/vnd.oasis.opendocument.presentation odp +' + +if [ "$content_type" = "" ]; then + echo "$formats" + exit 0 +fi + +fmt=`echo "$formats" | grep -w "^$content_type" | cut -d ' ' -f 2` +if [ "$fmt" = "" ]; then + echo "Content-Type: $content_type not supported" >&2 + exit 1 +fi + +# most decoders can't handle stdin directly, so write the attachment +# to a temp file +path=`mktemp` +trap "rm -f $path" 0 1 2 3 14 15 +cat > $path + +xmlunzip() { + name=$1 + + tempdir=`mktemp -d` + if [ "$tempdir" = "" ]; then + exit 1 + fi + trap "rm -rf $path $tempdir" 0 1 2 3 14 15 + cd $tempdir || exit 1 + unzip -q "$path" 2>/dev/null || exit 0 + find . -name "$name" -print0 | xargs -0 cat | + $libexec_dir/xml2text +} + +wait_timeout() { + childpid=$! + trap "kill -9 $childpid; rm -f $path" 1 2 3 14 15 + wait $childpid +} + +LANG=en_US.UTF-8 +export LANG +if [ $fmt = "pdf" ]; then + /usr/bin/pdftotext $path - 2>/dev/null& + wait_timeout 2>/dev/null +elif [ $fmt = "doc" ]; then + (/usr/bin/catdoc $path; true) 2>/dev/null& + wait_timeout 2>/dev/null +elif [ $fmt = "ppt" ]; then + (/usr/bin/catppt $path; true) 2>/dev/null& + wait_timeout 2>/dev/null +elif [ $fmt = "xls" ]; then + (/usr/bin/xls2csv $path; true) 2>/dev/null& + wait_timeout 2>/dev/null +elif [ $fmt = "odt" -o $fmt = "ods" -o $fmt = "odp" ]; then + xmlunzip "content.xml" +elif [ $fmt = "docx" ]; then + xmlunzip "document.xml" +elif [ $fmt = "xlsx" ]; then + xmlunzip "sharedStrings.xml" +elif [ $fmt = "pptx" ]; then + xmlunzip "slide*.xml" +else + echo "Buggy decoder script: $fmt not handled" >&2 + exit 1 +fi +exit 0 diff --git a/bundles/dovecot/files/dovecot.conf b/bundles/dovecot/files/dovecot.conf index 994a363..83f7b5a 100644 --- a/bundles/dovecot/files/dovecot.conf +++ b/bundles/dovecot/files/dovecot.conf @@ -52,3 +52,25 @@ service lmtp { group = postfix } } + +mail_plugins = fts fts_xapian + +# fulltext search +plugin { + fts = xapian + fts_xapian = partial=3 full=20 verbose=0 + fts_autoindex = yes + fts_enforced = yes + # Index attachements + fts_decoder = decode2text +} +service indexer-worker { + vsz_limit = 1G +} +service decode2text { + executable = script /usr/local/libexec/dovecot/decode2text.sh + user = dovecot + unix_listener decode2text { + mode = 0666 + } +} diff --git a/bundles/dovecot/items.py b/bundles/dovecot/items.py index 86c1105..67fde1d 100644 --- a/bundles/dovecot/items.py +++ b/bundles/dovecot/items.py @@ -73,3 +73,11 @@ svc_systemd = { }, }, } + +# fulltext search + +directories['/usr/local/libexec/dovecot'] = {} +files['/usr/local/libexec/dovecot/decode2text.sh'] = { + 'owner': 'dovecot', + 'mode': '500', +} diff --git a/bundles/dovecot/metadata.py b/bundles/dovecot/metadata.py index feb27ef..cd8ac4d 100644 --- a/bundles/dovecot/metadata.py +++ b/bundles/dovecot/metadata.py @@ -1,11 +1,15 @@ defaults = { 'apt': { 'packages': { - 'dovecot-imapd': {}, - 'dovecot-pgsql': {}, - 'dovecot-lmtpd': {}, + 'dovecot-imapd': {}, + 'dovecot-pgsql': {}, + 'dovecot-lmtpd': {}, # 'dovecot-sieve': {}, # 'dovecot-managesieved': {}, + # fulltext search + 'dovecot-fts-xapian': {}, # buster-backports + 'poppler-utils': {}, # pdftotext + 'catdoc': {}, # catdoc, catppt, xls2csv }, }, 'letsencrypt': {