; diskover config file
; if you make any changes, restart worker bots so they get the new config

[excludes]
; directory names and absolute paths you want to exclude from crawl, case-sensitive,
; can include wildcards (.* or backup* or /dir/dirname* or *tmp or *tmp* etc)
dirs = .*,.snapshot,.Snapshot,.zfs
; files you want to exclude from crawl, case-sensitive,
; can include wildcards (.*, *.doc or NULLEXT for files with no extension)
files = .*,Thumbs.db,.DS_Store,._.DS_Store,.localized,desktop.ini

[includes]
; directory names and absolute paths you want to include (whitelist), case-sensitive;
; you don't need to whitelist rootdir (-d rootdir)
;dirs = .recycle
; files you want to include (whitelist), case-sensitive
;files =

[ownersgroups]
; control how owner (username) and group fields are stored for file and directory docs
; store uids and gids instead of trying to get owner and group names (default is False)
;uidgidonly = False
; if owner/group names contain a domain name, set to True (default is False)
;domain = False
; character separator used on cifs/nfs mounts to separate user/group and domain name, usually \\ or @
;domainsep = \\
; if the domain name comes before the separator, set this to True, otherwise False (default is True)
;domainfirst = True
; when indexing owner and group fields, keep the domain name (default is False)
;keepdomain = False
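; example (illustrative values): on a cifs mount where owners appear as MYDOMAIN\jsmith,
; setting domain = True, domainsep = \\ and domainfirst = True would index the owner as
; jsmith; with keepdomain = True the full MYDOMAIN\jsmith is kept instead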
[autotag]
; pattern dictionaries for diskover bots to use when auto-tagging, values are case-sensitive,
; can include wildcards for ext, name or path (tmp* or TMP* or *tmp or *TMP* etc)
; a json file name can also be given instead, example: files = autotag.files.json
;files = [{"name": [], "name_exclude": [], "ext": ["tmp*", "TMP*", "temp*", "TEMP*", "cache*", "CACHE*"], "path": ["*/Application Support/*", "*/Containers/*"], "path_exclude": [], "mtime": 90, "atime": 0, "ctime": 90, "tag": "delete", "tag_custom": "autotag"}]
;dirs = [{"name": ["*tmp*", "*TMP*", "*temp*", "*TEMP*", "*Temp*", "*cache*", "*CACHE*", "*Cache*"], "name_exclude": ["*templates*", "*Templates*"], "path": ["*/Application Support/*", "*/Containers/*"], "path_exclude": [], "mtime": 90, "atime": 0, "ctime": 90, "tag": "delete", "tag_custom": "autotag"}]

[storagecost]
; storage cost per GB (default is 0.03)
costpergb = 0.03
; use decimal base 10 (1000) or binary base 2 (1024) for GB size (default is 2; set to 2 or 10)
base = 2
; pattern dictionaries for diskover bots to use when determining cost per GB (overrides the setting above)
; a json file name can also be given instead, example: paths = storagecost.paths.json
;paths = [{"path": ["*/fastdiskpath1/*", "*/Fastdiskpath2/*"], "path_exclude": [], "costpergb": 0.05}, {"path": ["*/slowdiskpath1/*", "*/Slowdiskpath2/*"], "path_exclude": [], "costpergb": 0.02}]
;times = [{"mtime": 180, "atime": 0, "ctime": 180, "costpergb": 0.02}]
; deciding factor when a match occurs in both paths and times; can be path or time
;priority = path
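; worked example (illustrative, assuming the values above): a 1099511627776 byte (1 TiB)
; file is 1024 GB with base = 2 (1099511627776 / 1024^3), so its cost is 1024 x 0.03 = 30.72;
; with base = 10 it is ~1099.51 GB (1099511627776 / 1000^3), costing ~32.99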
[elasticsearch]
; uncomment the below three lines if you are using AWS ES
;aws = False
;host = search-diskover-es-cluster-eg3yztrvzb6qucroyyjk2vokza.ap-northeast-1.es.amazonaws.com
;port = 443
; the below two lines are for local ES; comment out if you are using AWS ES
host = {{ES_HOST}}
port = {{ES_PORT}}
; the below two lines are for http-auth if you installed X-Pack
user = {{ES_USER}}
password = {{ES_PASS}}
; index name for ES; the cli arg overrides this
indexname = diskover-index
; timeout for connection to ES (default is 10)
timeout = 30
; number of connections kept open to ES when crawling (default is 10)
maxsize = 20
; max retries for ES operations (default is 0)
maxretries = 10
; wait for at least yellow status before bulk uploading (default is False); set to True if you want to wait
wait = False
; chunk size for ES bulk operations (default is 500)
chunksize = 1000
; number of shards for index (default is 5)
shards = 1
; number of replicas for index (default is 1)
replicas = 0
; the below settings are to optimize ES for crawling
; index refresh interval (default is 1s); set to -1 to disable refresh during crawl
; (fastest performance but no index searches); after the crawl it is set back to 1s
indexrefresh = 30s
; disable replicas during crawl: set to True to turn off replicas or False to keep them on (default False);
; after the crawl, replicas is set back to the value above
disablereplicas = True
; transaction log flush threshold size (default 512mb)
translogsize = 1gb
; search scroll size (default 100 docs)
scrollsize = 1000

[redis]
host = {{REDIS_HOST}}
port = {{REDIS_PORT}}
;password =
; cache directory times in Redis
; used for -I index2 when comparing directory times to get metadata from index2 instead of off disk
; set to True to cache dir times or False to turn off (default False)
cachedirtimes = False
; how long in seconds directory keys live in Redis (default 1 day)
dirtimesttl = 604800
; database to use (default is 0)
db = 0
; rq job default timeout in seconds (default 180)
timeout = 3600
; rq default ttl for key/results (default 500)
ttl = 500
; rq queue names to use (default is diskover, diskover_crawl, diskover_calcdir)
queue = diskover
queuecrawl = diskover_crawl
queuecalcdir = diskover_calcdir

[adaptivebatch]
; adaptive batch settings when using -a (intelligent crawling)
; batch size (number of dirs) to start at
startsize = 50
; maximum size of batch
maxsize = 500
; step used when adjusting batch size (+/-): increases when the queue is empty, decreases when it is not
stepsize = 10
; max number of files in a batch; the size settings above are ignored once the file limit is reached (default 50000)
maxfiles = 50000

[paths]
; used by diskover socket server
; path to diskover.py (default is ./diskover.py)
diskoverpath = ./diskover.py
; path to python executable (default is python)
pythonpath = python

[socketlistener]
; hostname and port (TCP) for diskover socket server for remote commands
host = 0.0.0.0
port = 9999
; max connections for diskover socket server
maxconnections = 5
; port (TCP) for diskover socket server for messages from the diskover treewalk client
twcport = 9998

[dupescheck]
; read size (bytes) for md5 sum check, i.e. how many bytes to read at a time when md5 checking (default 64 KB)
readsize = 65536
; max size (bytes) of files to check; files larger than this are skipped (default 1 GB)
maxsize = 1073741824
; bytes to check at the start and end of a file before doing the md5 sum check
; (set large enough to account for file header info, default is 64)
checkbytes = 64
; try to restore times (mtime/atime) for files that get opened by the byte check and md5 check;
; set to True or False, default False (useful for cifs, which does not work with the noatime mount option)
restoretimes = False
; number of threads for calculating md5 checksums of files
threads = 8

[crawlbot]
; continuous scanner when running with --crawlbot
; time to sleep (seconds) between checking for directory changes
sleeptime = 0.1
; number of threads for checking directories; number of cores x 2 is a good starting point
threads = 8
; how often in seconds to get a new directory list with updated times from ES (default 3600)
dirlisttime = 3600

[gource]
; should be set to the same value as in diskover-gource.sh
maxfilelag = 0.1

[crawlapi]
; crawl api url endpoint
;url = http://localhost:8080/api
; optional api login
;user = admin
;pass = admin
; number of items per page for each directory list request
;pagesize = 1000
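; example (illustrative): with pagesize = 1000, a directory listing of 2500 items
; would be fetched in three paged requests (1000 + 1000 + 500)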