This module reads a given TXT (or other text-based) file and searches it for the patterns defined in the patternmatch module. The number of encodings to try is a parameter (e.g. UTF-8, UTF-16, Windows-1252 and so on), and the module can currently test a file against every encoding available to the Python environment on the operating system.
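As a quick illustration of that enumeration trick, here is a minimal sketch of the same approach used by txt_get_all_encodings() in the code below: Python's standard encodings package can be walked with pkgutil, and every non-package module inside it is a codec.

# Minimal sketch: list every codec module shipped with this Python installation.
import pkgutil
import encodings

def list_codecs():
    names = set(name for imp, name, ispkg
                in pkgutil.iter_modules(encodings.__path__) if not ispkg)
    names.discard("aliases")  # helper module, not a real codec
    return sorted(names)

print "\n".join(list_codecs())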
I am not a programmer by any means, so the scripts below are pretty ugly, but they serve the purpose they were meant to for me. Please feel free to use or modify them as needed. Any feedback is also appreciated (info@ the domain of this website).
GitHub repository containing the source code:
https://github.com/bl305/pyHarvest
Program code:
#!/usr/bin/env python
# coding=utf-8
import chardet  # imported in the original but not used below
import codecs
import re       # imported in the original but not used below
import os
from pyHarvest_xmod_patternmatch import *

mypath = r'c:\test\FileEncoding\normal.txt'

# set verbosity
# -1 - no messages
#  0 - tuple of results
#  1 - summary information
#  2 - basic information, positive info
#  3 - detailed information, positive, negative
#  4 - go crazy about it...
myverbosity = 0

# max number of encodings to try; 0 means try all of them
mymaxencode = 0

# not used....
def to_unicode_or_bust(obj, encoding='utf-8'):
    if isinstance(obj, basestring):
        if not isinstance(obj, unicode):
            obj = unicode(obj, encoding)
    return obj

# GOOD: enumerate every codec module shipped with the Python installation
def txt_get_all_encodings():
    import pkgutil
    import encodings
    false_positives = set(["aliases"])
    found = set(name for imp, name, ispkg
                in pkgutil.iter_modules(encodings.__path__) if not ispkg)
    found.difference_update(false_positives)
    # some default limited lists:
    # encodings = ['utf-8', 'windows-1250', 'windows-1252']
    # encodings = ['big5', 'big5hkscs', 'cp950', 'gb2312', 'gbk',
    #              'gb18030', 'hz', 'iso2022_jp_2', 'utf_16',
    #              'utf_16_be', 'utf_16_le', 'utf_8', 'utf_8_sig']
    return found

# not really good
def txt_get_all_encodings_aliases():
    from encodings.aliases import aliases
    return aliases.keys()

# not really good
def txt_get_all_encodings_values():
    from encodings.aliases import aliases
    return aliases.values()

def txt_full_search_tuple_brief(path, maxencode=10, verbosity=0):
    encodings = txt_get_all_encodings()
    goodencodings = ()
    matches = ()
    maxencodecounter = 0
    if verbosity > 0:
        print "[+] Reading file:", path
    for e in encodings:
        if (maxencodecounter < maxencode) or (maxencode == 0):
            if verbosity > 3:
                print "[+] Current Encoding:%d, Max Encoding:%d" % (maxencodecounter, maxencode)
            try:
                if verbosity > 1:
                    print "[+] Testing encoding:", e
                fh = codecs.open(path, 'r', encoding=e)
                line = ""
                try:
                    line = fh.read()
                    goodencodings += (e,)
                    maxencodecounter += 1
                    # match works regardless of unicode printing... doesn't throw an error
                    # that bad... but not good for big and simple unicode
                    mymatch = find_pattern(line)
                    if mymatch:
                        if verbosity > 1:
                            print "[+] Match found!!", e, mymatch[0]
                        mymatch += (e,)
                        # bugfix: append as a single element; the original
                        # matches+=(mymatch) flattened the tuple into matches
                        matches += (mymatch,)
                    # try:
                    #     print line.encode('utf-8')
                    #     print "Error:", e
                    # except:
                    #     if verbosity > 3:
                    #         print "[-] Cannot print using encoding", e
                    #     pass
                except:
                    pass
                fh.close()
            except UnicodeDecodeError:
                pass
            except UnicodeError:
                pass
        else:
            pass  # try all options
            # break  # or stop at first, that might be good, might not...
    tresult = ()  # bugfix: initialise so the function also returns when nothing matched
    if matches:
        tresult = matches
        tresult += ((os.path.dirname(path), os.path.basename(path)),)
    return tresult

def txt_full_search_tuple(apath, maxencode=10, verbosity=0):
    encodings = txt_get_all_encodings()
    goodencodings = ()
    matches = ()
    mypath = os.path.dirname(apath)
    myfilename = os.path.basename(apath)
    maxencodecounter = 0
    if verbosity > 0:
        print "[+] Reading file:", apath
    for e in encodings:
        if (maxencodecounter < maxencode) or (maxencode == 0):
            if verbosity > 3:
                print "[+] Current Encoding:%d, Max Encoding:%d" % (maxencodecounter, maxencode)
            try:
                if verbosity > 1:
                    print "[+] Testing encoding:", e
                fh = codecs.open(apath, 'r', encoding=e)
                line = ""
                try:
                    line = fh.read()
                    goodencodings += (e,)
                    maxencodecounter += 1
                    # match works regardless of unicode printing... doesn't throw an error
                    # that bad... but not good for big and simple unicode
                    mymatch = find_pattern(line)
                    if mymatch:
                        for i1 in range(len(mymatch)):
                            if verbosity > 1:
                                print "[+] Match found!!", e, mymatch[i1]
                            matches += ((unicode(mymatch[i1][0]), unicode(mymatch[i1][1]),
                                         unicode(mymatch[i1][2]), unicode(e),
                                         unicode(mypath), unicode(myfilename)),)
                except:
                    pass
                fh.close()
            except UnicodeDecodeError:
                pass
            except UnicodeError:
                pass
        else:
            pass  # try all options
            # break  # or stop at first, that might be good, might not...
    return matches

def txt_full_search_list(apath, amaxencode=10, averbosity=0):
    myinput = txt_full_search_tuple(apath, maxencode=amaxencode, verbosity=averbosity)
    result = ""
    for i1 in range(len(myinput)):
        # bugfix: the original format string had "\%s" where "\t%s" was intended
        result += "%s\t%s\t%s\t%s\t%s\t%s\n" % (unicode(myinput[i1][0]), unicode(myinput[i1][1]),
                                                unicode(myinput[i1][2]), unicode(myinput[i1][3]),
                                                unicode(myinput[i1][4]), unicode(myinput[i1][5]))
    return result

# if myverbosity >= 0:
#     print txt_full_search_tuple_brief(mypath, maxencode=mymaxencode, verbosity=myverbosity)
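A minimal usage sketch, assuming pyHarvest_xmod_patternmatch provides the find_pattern() function the script imports; the file path below is only an example, not a fixed location:

# Hypothetical usage: search one file with every available encoding (maxencode=0)
# and print the results as tab-separated rows.
testfile = r'c:\test\FileEncoding\normal.txt'
for row in txt_full_search_tuple(testfile, maxencode=0, verbosity=1):
    # each row: three fields from find_pattern(), then encoding, directory, file name
    print "\t".join(row)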