pyHarvest FTP

This module crawls a given FTP server and provides the following options (parameter dependent):

  • start crawling from given directory
  • lists contents (file name, size, date, etc)
  • recursive or limited to current directory
  • downloads contents using filters on:
    • size limit
    • file name
    • directory name
    • hierarchy level/limit max depth

I am not a programmer by any means, so the scripts below are pretty ugly, but they serve the purpose they were meant to serve for me. Please feel free to use or modify them as needed. Any feedback is also appreciated (info@ domain of this website). 

GitHub resource containing source code:

https://github.com/bl305/pyHarvest

Program code:

#!/usr/bin/env python
# coding=utf-8
 
 
import errno
import ftplib
import os
import re
import sys
import time
from datetime import datetime
from ftplib import FTP
 
#templates:
#IIS MS-DOS default
#11-01-15  05:11AM       <DIR>          Level1
#10-30-15  12:38PM                   12 Level2.txt
#10-30-15  12:38PM       <DIR>          Level2_2
#10-30-15  12:44PM       <DIR>          x
#maybe:^(\d{2}-\d{2}-\d{2})\s+(\d{2}:\d{2}(A|P)M)\s+(<DIR>)*\s+(\d*)\s+(\S+)
 
#IIS-UNIX default
#'-rwxrwxrwx   1 owner    group              12 Oct 30 12:38 Level2.txt'
 
#FTP PARAMETERS
#	#domain name or server ip:
#	#ftp = FTP('ftp.fau.de')
#	FTPserver='ftp.au.debian.org'
#	FTPdir='/pub/linux/debian/doc/'
#
#	#myftpserver='192.168.178.182'
#	#myftpdir='/'
#	FTPusername="anonymous"
#	FTPpassword="anony@mo.us"
#
#	#limit to pull files smaller than this
#	#http://www.whatsabyte.com/P1/byteconverter.htm
#	FTPflimit_1K=1024
#	FTPflimit_1M=1048576
#	FTPflimit_1GB=1073741824
#	#flimit_max:9223372036854775807
#	FTPflimit=FTPflimit_1K*1
#	#output directory path
#	#current dir:
#	#outdir=os.path.realpath('.')
#	FTPoutdir="./DataGathered/FTP"
#
#	FTPskiplist=('/dev','/bin','/opt','initrd.img','vmlinuz')
#
#	FTPdownloadfiles=0
#	FTPcreatedirs=0
 
#http://code.activestate.com/recipes/499334-remove-ftp-directory-walk-equivalent/
def parse_dir_line_linux(ftp,top,myskiplist1):
	"""
	List the connection's current directory and parse its UNIX-style lines.

	The lines are expected to look like ``ls -l`` output, for example:
	'drwxr-xr-x   5 ftpfau   ftpfau       4096 Sep 12  2013 aptosid'
	'drwxrwxr-x 180 ftpfau   ftpfau       4096 Oct 27 17:14 apache'

	Parameters:
	ftp -- object providing dir(callback); the listing comes from whatever
	directory is current on that connection (this function does not cwd).
	top -- path stored in every entry as its parent directory and checked
	against the skip list.
	myskiplist1 -- container of directory paths that must not be listed.

	Returns the 3-tuple (entry, dirs, nondirs):
	entry -- the last entry that was parsed.
	dirs -- list of entries whose permission string starts with 'd'.
	nondirs -- list of every other entry.
	Each entry is the tuple (filename, permission, second listing column,
	user, group, size, 'YYYY-MM-DD HH:MM:SS', top, filetype, link target),
	where filetype is 'f', 'd', 'l' or 'X' and link target is None unless
	the name contained ' -> '.
	((), (), ()) is returned when top is in the skip list or when the
	listing held no usable entry.

	Raises ValueError when the size, day or time/year column cannot be
	parsed and KeyError when the month abbreviation is unknown.
	"""
	if top in myskiplist1:
		print("[+] Skipping dir:  %s" % top)
		return (),(),()

	listing = []
	ftp.dir(listing.append)

	# Built once per call instead of once per listing line.
	calmonths = dict((abbr, number + 1) for number, abbr in enumerate(
		('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
		'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')))
	current_year = datetime.today().year

	entry = ()
	dirs, nondirs = [], []
	for line in listing:
		ssline = line.split(None, 8)

		# A complete line has nine columns; summary lines such as 'total 12'
		# or truncated lines are reported and skipped, not indexed blindly.
		if len(ssline) < 9:
			sys.stderr.write('Warning: Error reading short line %s\n' % line)
			continue

		# Get the filename.
		filename = ssline[-1].lstrip()
		# Get the link target, if the file is a symlink.
		extra = None
		i = filename.find(" -> ")
		if i >= 0:
			# ssline[0] had better start with 'l'...
			extra = filename[i+4:]
			filename = filename[:i]
		# Listing '.' or '..' as a directory would make a recursive walk
		# revisit the same directory forever.  (The original also named
		# '0' here but never skipped anything, so '0' stays listed.)
		if filename in ('.', '..'):
			continue

		mysomething=ssline[1]
		myuser=ssline[2]
		mygroup=ssline[3]

		# Get the file size.
		size = int(ssline[4])

		# Get the date.  Lines that carry a time instead of a year are
		# assumed to belong to the current year.
		year = current_year
		month = calmonths[ssline[5]]
		day = int(ssline[6])
		mo = re.match(r'(\d+):(\d+)', ssline[7])
		if mo:
			hour, minute = map(int, mo.groups())
		else:
			mo = re.match(r'(\d\d\d\d)', ssline[7])
			if not mo:
				raise ValueError("Could not parse time/year in line: '%s'" % line)
			year = int(mo.group(1))
			hour, minute = 0, 0
		dt = datetime(year, month, day, hour, minute)
		mtime = time.mktime(dt.timetuple())
		mytime=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(mtime))

		# Get the type and permission.
		permission = ssline[0]
		if permission[0] == '-':
			filetype="f"
		elif permission[0] in ('d', 'l'):
			filetype=permission[0]
		else:
			filetype="X"

		entry = (filename, permission, mysomething, myuser, mygroup, size, mytime, top, filetype, extra)
		if filetype == 'd':
			dirs.append(entry)
		else:
			nondirs.append(entry)

	if entry == ():
		print("[-] Empty dir:  %s" % top)
		return (),(),()
	return entry,dirs,nondirs
 
def ftpwalk_linux(ftp, top, myskiplist, topdown=True, onerror=None):
	"""
	Generator yields tuples of (root, dirs, nondirs).

	Parameters:
	ftp -- connection object; cwd(top) is called on it before listing.
	top -- directory to start from; sub-directory paths are built as
	"top/name" with doubled slashes collapsed.
	myskiplist -- passed through to parse_dir_line_linux, which returns
	empty results for a directory found in it.
	topdown -- when true a directory is yielded before its
	sub-directories, otherwise after them.
	onerror -- optional callable; it receives the exception whenever a
	directory cannot be entered or listed.

	A directory that cannot be entered or listed is skipped (nothing is
	yielded for it) and the walk carries on with the remaining ones.
	"""
	# Server replies such as "550 permission denied" are ftplib.Error
	# subclasses; os.error is kept because the original caught it.
	walk_errors = (ftplib.Error, os.error)

	# Make the FTP object's current directory to the top dir.  If that
	# fails the directory is skipped: listing anyway would return the
	# contents of whatever directory is still current and file them
	# under the wrong path.
	try:
		ftp.cwd(top)
	except walk_errors as err:
		if onerror is not None:
			onerror(err)
		return

	# We may not have read permission for top, in which case we can't
	# get a list of the files the directory contains.  os.path.walk
	# always suppressed the exception then, rather than blow up for a
	# minor reason when (say) a thousand readable directories are still
	# left to visit.  That logic is copied here.
	try:
		_last,dirs,nondirs = parse_dir_line_linux(ftp,top,myskiplist)
	except walk_errors as err:
		if onerror is not None:
			onerror(err)
		return

	if topdown:
		yield top, dirs, nondirs

	# dirs is empty whenever the listing was empty or skipped, so no
	# separate "anything found?" test is needed before recursing.
	for entry in dirs:
		if entry[-1] is not None:
			# A link target is recorded: do not follow links.
			continue
		path = ("%s/%s" % (top, entry[0])).replace('//','/')
		for x in ftpwalk_linux(ftp, path, myskiplist, topdown, onerror):
			yield x

	if not topdown:
		yield top, dirs, nondirs
 
def get_file(ftp,ftpserver,path,filename,aoutdir,averbosity=0):
	"""
	Download one file into aoutdir/ftpserver/path/filename.

	Parameters:
	ftp -- connection object providing retrbinary(command, callback).
	ftpserver -- server name, used as a directory level below aoutdir.
	path -- remote directory holding the file.
	filename -- name of the file inside path.
	aoutdir -- local root directory for downloads.
	averbosity -- values above 0 print a line for each transfer.

	The local directory is created when missing.  Returns the local file
	path on success and None when the local file cannot be opened or the
	transfer fails; both failures are reported on standard output.
	Raises OSError when the local directory cannot be created for a
	reason other than already existing.
	"""
	abspath = (aoutdir+"/"+ftpserver+"/"+path+"/").replace('//','/')
	# Two passes: one replace() turns '///' into '//' only.
	abspathfile = (abspath+"/"+filename).replace('//','/').replace('//','/')
	getthis = (path + "/" + filename).replace('//','/')

	try:
		if not os.path.exists(abspath):
			os.makedirs(abspath)
			print("[+] Creating directory %s" % abspath)
	except OSError as exception:
		# Created by someone else between the check and makedirs: fine.
		if exception.errno != errno.EEXIST:
			raise

	try:
		fhandle = open(abspathfile, 'wb')
	except (IOError, OSError):
		print("Local file coulnd't be generated...file naming restriction??? %s" % abspathfile)
		return None

	# The with block closes (and flushes) the file on every path; the
	# original only closed it when the transfer failed.
	with fhandle:
		if averbosity>0:
			print("[+] Getting file \"%s\" to \"%s\""%(getthis,abspathfile))
		try:
			ftp.retrbinary("RETR " + getthis ,fhandle.write)
		except ftplib.all_errors:
			print("[-] Error getting file \"%s\" to \"%s\""%(getthis,abspathfile))
			return None
	return abspathfile
 
 
def prepare_dir(autdir,ftpserver,path,adir,averbosity=0):
	"""
	Create the local directory autdir/ftpserver/path/adir.

	Parameters:
	autdir -- local root directory for downloads (the parameter name is
	kept as it was; the original body read an undefined "aoutdir").
	ftpserver -- server name, used as a directory level below autdir.
	path -- remote parent directory of adir.
	adir -- name of the directory to mirror locally.
	averbosity -- above 0 reports a newly created directory, above 3 also
	reports one that already existed.

	Returns the path of the directory with doubled slashes collapsed.
	Raises OSError when creation fails for a reason other than the
	directory already existing.
	"""
	abspath = (autdir+"/"+ftpserver+"/"+path+"/"+adir).replace('//','/')
	try:
		os.makedirs(abspath)
	except OSError as exception:
		if exception.errno != errno.EEXIST:
			raise
		if averbosity>3:
			print("[-] Directory already exist? %s" % path)
	else:
		# Success branch: the original printed a "create error" message here.
		if averbosity>0:
			print("[+] Creating directory %s" % path)
	return abspath
 
def _ftp_record(aftpserver, name, aoutdir):
	"""Build one flat result row from a parsed listing entry."""
	return (aftpserver,name[7],name[0],name[8],name[5],'',name[1],name[5],'','','',name[6],name[3],name[4],name[2],'',aoutdir)


def ftp_main(aftpserver,aftpdir,ausername,apassword,aflimit,aoutdir,askiplist,adownloadfiles=0,acreatedirs=0,averbosity=0):
	"""
	Crawl an FTP server and optionally mirror what is found.

	Parameters:
	aftpserver -- host passed to FTP().
	aftpdir -- remote directory where the walk starts.
	ausername, apassword -- login credentials.
	aflimit -- only files smaller than this many bytes are recorded and
	downloaded.
	aoutdir -- local root directory handed to prepare_dir/get_file.
	askiplist -- directory paths not to list and file names not to record.
	adownloadfiles -- 1 downloads every accepted file.
	acreatedirs -- 1 creates a local directory for every remote one.
	averbosity -- above 0 prints the size limit, above 2 prints every
	recorded directory and file.

	Returns a tuple with one 17-field row per recorded directory and
	accepted file.  Symbolic links (file type 'l') are never recorded.
	"""
	if averbosity>0:
		FTPflimit_1K=1024
		FTPflimit_1M=1048576
		FTPflimit_1GB=1073741824
		print("Size limit to pull per file KB:%d MB:%d GB:%d"%(aflimit//FTPflimit_1K,aflimit//FTPflimit_1M,aflimit//FTPflimit_1GB))

	# Rows are collected in a list; growing a tuple with += copies all
	# earlier rows on every addition.
	allitems = []
	ftp = FTP(aftpserver)
	try:
		ftp.login(user=ausername, passwd = apassword)
		ftp.cwd(aftpdir)

		for root, dirs, files in ftpwalk_linux(ftp,aftpdir,askiplist):
			for name in dirs:
				if acreatedirs==1:
					prepare_dir(aoutdir,aftpserver,name[7],name[0],averbosity)
				if averbosity>2:
					print("Dir: %s %s %s %s %s %s %s %s %s %s %s"%((aftpserver,)+tuple(name[:9])+(aoutdir,)))
				allitems.append(_ftp_record(aftpserver, name, aoutdir))

			for name in files:
				if name[5]>=aflimit or name[0] in askiplist or name[8]=='l':
					continue
				if averbosity>2:
					# The original labelled these lines "Dir:" as well.
					print("File: %s %s %s %s %s %s %s %s %s %s %s"%((aftpserver,)+tuple(name[:9])+(aoutdir,)))
				if adownloadfiles==1:
					get_file(ftp,aftpserver,name[7],name[0],aoutdir,averbosity)
				allitems.append(_ftp_record(aftpserver, name, aoutdir))
	finally:
		# Always release the connection, also when the walk raised.  If
		# the polite QUIT fails, just drop the socket.
		try:
			ftp.quit()
		except ftplib.all_errors:
			ftp.close()
	return tuple(allitems)