; diskover config file
; if you make any changes, restart worker bots so they get the new config
[excludes]
; directory names and absolute paths you want to exclude from crawl, case-sensitive, can include wildcards (.* or backup* or /dir/dirname* or *tmp or *tmp* etc)
dirs = .*,.snapshot,.Snapshot,.zfs
; files you want to exclude from crawl, case-sensitive, can include wildcards (.*, *.doc or NULLEXT for files with no extension)
files = .*,Thumbs.db,.DS_Store,._.DS_Store,.localized,desktop.ini
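; example (hypothetical values): to also skip any directory ending in _old and
; all *.bak files, extend the lists above like this:
;dirs = .*,.snapshot,.Snapshot,.zfs,*_old
;files = .*,Thumbs.db,.DS_Store,._.DS_Store,.localized,desktop.ini,*.bak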
[includes]
; directory names and absolute paths you want to include (whitelist), case-sensitive, you don't need to whitelist rootdir (-d rootdir)
;dirs = .recycle
; files you want to include (whitelist), case-sensitive
;files =
[ownersgroups]
; control how owner (username) and group fields are stored for file and directory docs
; store uids and gids instead of trying to resolve owner and group names (default is False)
;uidgidonly = False
; set to True if owner/group names contain a domain name (default is False)
;domain = False
; character separator used on cifs/nfs mounts to separate user/group and domain name, usually \\ or @
;domainsep = \\
; if the domain name comes before the separator, set this to True, otherwise False (default is True)
;domainfirst = True
; when indexing owner and group fields, keep the domain name (default is False)
;keepdomain = False
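; worked example (hypothetical name): with domainsep = \\, domainfirst = True and
; keepdomain = False, the owner "MYDOMAIN\jsmith" is indexed as "jsmith";
; with keepdomain = True it is indexed unchanged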
[autotag]
; pattern dictionaries for diskover bots to use when auto-tagging, values are case-sensitive, can include wildcard for ext, name or path (tmp* or TMP* or *tmp or *TMP* etc)
; can also specify file name with json, example files = autotag.files.json
;files = [{"name": [], "name_exclude": [], "ext": ["tmp*", "TMP*", "temp*", "TEMP*", "cache*", "CACHE*"], "path": ["*/Application Support/*", "*/Containers/*"], "path_exclude": [], "mtime": 90, "atime": 0, "ctime": 90, "tag": "delete", "tag_custom": "autotag"}]
;dirs = [{"name": ["*tmp*", "*TMP*", "*temp*", "*TEMP*", "*Temp*", "*cache*", "*CACHE*", "*Cache*"], "name_exclude": ["*templates*", "*Templates*"], "path": ["*/Application Support/*", "*/Containers/*"], "path_exclude": [], "mtime": 90, "atime": 0, "ctime": 90, "tag": "delete", "tag_custom": "autotag"}]
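; minimal example rule (hypothetical values): tag *.log files not modified in 30
; days for archiving; unused pattern lists stay empty and unused times stay 0
;files = [{"name": ["*.log"], "name_exclude": [], "ext": [], "path": [], "path_exclude": [], "mtime": 30, "atime": 0, "ctime": 0, "tag": "archive", "tag_custom": "logs"}]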
[storagecost]
; storage cost per GB (default is 0.03, i.e. 3 cents per GB)
costpergb = 0.03
; use decimal base 10 (1000) or binary base 2 (1024) for GB size (default is 2, set to 2 or 10)
base = 2
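; example: with base = 2, 1 GB is counted as 1073741824 bytes (1024^3);
; with base = 10, 1 GB is 1000000000 bytes (1000^3)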
; pattern dictionaries for diskover bots to use when determining cost per GB (overrides above)
; can also specify file name with json, example paths = storagecost.paths.json
;paths = [{"path": ["*/fastdiskpath1/*", "*/Fastdiskpath2/*"], "path_exclude": [], "costpergb": 0.05}, {"path": ["*/slowdiskpath1/*", "*/Slowdiskpath2/*"], "path_exclude": [], "costpergb": 0.02}]
;times = [{"mtime": 180, "atime": 0, "ctime": 180, "costpergb": 0.02}]
; deciding factor when a file matches both paths and times, set to path or time
;priority = path
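; worked example (using the sample values above): a file under */fastdiskpath1/*
; with mtime older than 180 days matches both rules; priority = path bills it at
; 0.05 per GB, priority = time would bill it at 0.02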
[elasticsearch]
; uncomment the below three lines if you are using AWS ES
;aws = False
;host = search-diskover-es-cluster-eg3yztrvzb6qucroyyjk2vokza.ap-northeast-1.es.amazonaws.com
;port = 443
; below two lines are for local ES, comment out if you are using AWS ES
host = {{ES_HOST}}
port = {{ES_PORT}}
; the below two lines are for http-auth if you installed X-Pack
user = {{ES_USER}}
password = {{ES_PASS}}
; index name for ES, cli arg overrides this
indexname = diskover-index
; timeout for connection to ES (default is 10)
timeout = 30
; number of connections kept open to ES when crawling (default is 10)
maxsize = 20
; max retries for ES operations (default is 0)
maxretries = 10
; wait for at least yellow cluster status before bulk uploading, set to True to wait (default is False)
wait = False
; chunk size for ES bulk operations (default is 500)
chunksize = 1000
; number of shards for index (default is 5)
shards = 1
; number of replicas for index (default is 1)
replicas = 0
; the below settings are to optimize ES for crawling
; index refresh interval (default is 1s); set to -1 to disable refresh during crawl (fastest performance, but the index can't be searched); restored to 1s after the crawl
indexrefresh = 30s
; disable replicas during crawl: set to True to turn replicas off or False to keep them on (default is False); after the crawl, replicas is restored to the value above
disablereplicas = True
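; example: with the values above, a crawl runs with 30s refresh and 0 replicas,
; then refresh is restored to 1s and replicas to the replicas setting above (0 here)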
; transaction log flush threshold size (default 512mb)
translogsize = 1gb
; search scroll size (default 100 docs)
scrollsize = 1000
[redis]
host = {{REDIS_HOST}}
port = {{REDIS_PORT}}
;password =
; cache directory times in Redis
; used for -I index2 when comparing directory times to get metadata from index2 instead of off disk
; set to True to cache dir times or False to turn off (default False)
cachedirtimes = False
; how long in seconds directory keys live in Redis (default is 86400 = 1 day; 604800 = 7 days)
dirtimesttl = 604800
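; example: with cachedirtimes = True and the ttl above, a directory's times stay
; cached for 7 days, so re-crawls with -I index2 inside that window can pull
; unchanged directories from index2 instead of re-reading them off disk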
; database to use (default is 0)
db = 0
; rq job default timeout in sec (default 180)
timeout = 3600
; rq default ttl for key/results (default 500)
ttl = 500
; rq queue names to use (default is diskover, diskover_crawl, diskover_calcdir)
queue = diskover
queuecrawl = diskover_crawl
queuecalcdir = diskover_calcdir
[adaptivebatch]
; adaptive batch settings when using -a (intelligent crawling)
; batch size (number of dirs) to start at
startsize = 50
; maximum size of batch
maxsize = 500
; step used to adjust the batch size by +/- (increases when the queue is empty, decreases when items are still queued)
stepsize = 10
; max number of files in a batch; the dir limits above are ignored once this file limit is reached (default 50000)
maxfiles = 50000
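; worked example (using the values above): batches start at 50 dirs and grow by
; 10 (50, 60, 70, ...) each time the queue drains to 0, up to 500 dirs; when
; items are still queued the batch shrinks by 10; a batch holding 50000 files is
; dispatched regardless of its dir count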
[paths]
; used by diskover socket server
; path to diskover.py (default is ./diskover.py)
diskoverpath = ./diskover.py
; path to python executable (default is python)
pythonpath = python
[socketlistener]
; hostname and port (TCP) for diskover socket server for remote commands
host = 0.0.0.0
port = 9999
; max connections for diskover socket server
maxconnections = 5
; port (TCP) for diskover socket server for messages from diskover treewalk client
twcport = 9998
[dupescheck]
; read size in bytes for md5 sum check, i.e. how many bytes are read at a time when md5 checking (default 65536 = 64 KB)
readsize = 65536
; max size (bytes) of files to check (files larger than this will be skipped, default 1 GB)
maxsize = 1073741824
; bytes to check at start and end of file before doing md5 sum check (set large enough to account for file header info, default is 64)
checkbytes = 64
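; example: with the values above, candidate duplicate files are first compared on
; their leading and trailing 64 bytes (checkbytes); only if those match is a full
; md5 computed, read 64 KB (readsize) at a time; files over 1 GB (maxsize) are skipped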
; try to restore times (mtime/atime) for files that get opened by byte check and md5
; set to True or False, default is False (useful for cifs, which does not support the noatime mount option)
restoretimes = False
; number of threads for calculating md5 checksum of files
threads = 8
[crawlbot]
; continuous scanner when running with --crawlbot
; time to sleep (seconds) between checking for directory changes
sleeptime = 0.1
; number of threads for checking directories; setting this to the number of CPU cores x 2 is a good starting point
threads = 8
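; example: on a 4-core machine the guideline above gives threads = 8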
; how often in seconds to get new directory list with updated times from ES (default 3600)
dirlisttime = 3600
[gource]
; should be set to the same value as in diskover-gource.sh
maxfilelag = 0.1
[crawlapi]
; crawl api url endpoint
;url = http://localhost:8080/api
; optional api login
;user = admin
;pass = admin
; number of items per page for each directory list request
;pagesize = 1000