This module performs the pattern matching for the other modules. The patterns are customizable regular expressions.
I am not a programmer by any means, so the scripts below are pretty ugly, but they serve the purpose I wrote them for. Please feel free to use or modify them as needed. Any feedback is also appreciated (info@ the domain of this website).
GitHub resource containing source code:
https://github.com/bl305/pyHarvest
Program code:
#!/usr/bin/env python
# coding=utf-8
"""Regular-expression pattern matcher.

Defines a table of named, customizable regex patterns (credit card
numbers, hash-like strings, password/username keywords) and
``find_pattern``, which reports every match of those patterns in a text.

The code is written to run unchanged on both Python 2.7 and Python 3.
"""
import re

# Sample input for trying the matcher by hand, e.g.
#     find_pattern(mydata, mypatterns, 0)
mydata = '''This is a simple text that could be matched 4444 4444 4444 4444 aaa:password bbb Password'''

# Each entry is (pattern_name, regex). Entries whose name starts with
# "creditcard" (case-insensitive) get special treatment in aggressive mode.
# Plain raw strings are used instead of ``ur''`` literals because ``ur''``
# is a syntax error on Python 3.
mypatterns = (
    # CREDIT CARD DATA
    ('CreditCard_16numbers', r'\d{16}'),
    ('CreditCard_ALL', r'(?:\d[ -]*?){13,16}'),
    ('CreditCard_VISA', r'^4\d{3}([\ \-]?)\d{4}\1\d{4}\1\d{4}'),
    ('CreditCard_MASTER', r'^5[1-5]\d{2}([\ \-]?)\d{4}\1\d{4}\1\d{4}'),
    ('CreditCard_DISCOVER', r'^6(?:011|22(?:1(?=[\ \-]?(?:2[6-9]|[3-9]))|[2-8]|9(?=[\ \-]?(?:[01]|2[0-5])))|4[4-9]\d|5\d\d)([\ \-]?)\d{4}\1\d{4}\1\d{4}'),
    ('CreditCard_JAPAN', r'^35(?:2[89]|[3-8]\d)([\ \-]?)\d{4}\1\d{4}\1\d{4}'),
    ('CreditCard_AMEX', r'(?<!\-|\.)3[47]\d\d([\ \-]?)(?<!\d\ \d{4}\ )(?!(\d)\2{5}|123456|234567|345678)\d{6}(?!\ \d{5}\ \d)\1(?!(\d)\3{4}|12345|56789)\d{5}(?!\-)(?!\.\d)'),
    ('CreditCard_AMEX_notdash', r'^3[47]\d\d([\ \-]?)\d{6}\1\d{5}'),
    ('CreditCard_CHINA_UNION', r'^62[0-5]\d{13,16}'),
    ('CreditCard_MAESTRO', r'^(?:5[0678]\d\d|6304|6390|67\d\d)\d{8,15}'),
    ('CreditCard_VISA_MASTER_AMEX_DISCOVER', r'(?:3[47]\d{2}([\ \-]?)\d{6}\1\d|(?:(?:4\d|5[1-5]|65)\d{2}|6011)([\ \-]?)\d{4}\2\d{4}\2)\d{4}'),
    # MD5
    ('PWD_MD5', r'(\b[A-Fa-f0-9]{32}\b)'),
    # SHA1 -- marked "NOT WORKING" by the original author; reason not recorded.
    ('PWD_SHA1', r'\b([a-f0-9]{40})\b'),
    # PASSWORD STRING, case insensitive
    ('pwd_password', r'(?i)password'),
    ('pwd_pwd', r'(?i)pwd'),
    ('pwd_passw', r'(?i)passw'),
    # USERNAME STRING, case insensitive
    ('username_username', r'(?i)username'),
    ('username_spanish', r'(?i)usuario'),
)

# Characters removed from the data before credit card patterns are applied
# in aggressive mode.
_CARD_SEPARATORS = (" ", "-", ",", ".")


def _strip_separators(text):
    """Return *text* with spaces, dashes, commas and periods removed."""
    for separator in _CARD_SEPARATORS:
        text = text.replace(separator, "")
    return text


def _collect_matches(name, pattern, text):
    """Return ``[(matched_text, name, pattern), ...]`` for *pattern* in *text*.

    An invalid *pattern* raises ``re.error`` from ``re.compile``. An error
    raised while iterating over the matches is printed and the matches
    found up to that point are still returned.
    """
    found = []
    regex = re.compile(pattern)
    try:
        for match in regex.finditer(text):
            found.append((match.group(), name, pattern))
    except Exception as exc:
        # Best effort: report the problem and keep what was already found.
        print(exc)
    return found


def find_pattern(data, patterns=mypatterns, aggressive=0):
    """Search *data* for every pattern and return all matches.

    Args:
        data: The text to search.
        patterns: Iterable of entries whose first element is the pattern
            name and whose second element is the regex source string.
            Defaults to ``mypatterns``.
        aggressive: When equal to ``1``, patterns whose name starts with
            "creditcard" (case-insensitive) are matched against a copy of
            *data* with spaces, dashes, commas and periods removed, so that
            card numbers split by separators are still found. Any other
            value selects the normal search, which matches every pattern
            against *data* as given.

    Returns:
        A tuple of ``(matched_text, pattern_name, pattern_regex)`` tuples,
        in pattern order and then in match order. Empty when nothing
        matches.

    Raises:
        re.error: In normal mode, when a pattern is not a valid regex. In
            aggressive mode, errors for a single pattern are printed and
            that pattern is skipped instead.
    """
    matches = []
    cleaned = None  # separator-free copy of data, built on first use
    for entry in patterns:
        name, pattern = entry[0], entry[1]
        if aggressive != 1:
            matches.extend(_collect_matches(name, pattern, data))
            continue
        try:
            text = data
            if name.lower().startswith("creditcard"):
                if cleaned is None:
                    cleaned = _strip_separators(data)
                text = cleaned
            # NOTE(review): the published listing this was restored from
            # had lost its indentation, so it is not certain how
            # non-credit-card patterns were meant to be handled in
            # aggressive mode. Here they are searched against the
            # unmodified data -- confirm against the upstream repository.
            matches.extend(_collect_matches(name, pattern, text))
        except Exception as exc:
            # Aggressive mode never raised for a failing pattern; keep that
            # best-effort contract but report the error instead of hiding it.
            print(exc)
    return tuple(matches)