1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
| import re import os import uuid import time
def _dedupe(items):
    """Return *items* without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(items))


def _collect_input_rows(source, app_name):
    """Build the ``table,app,INPUT`` rows for every table read via ``FROM``."""
    rows = []
    for candidate in re.findall(r"[fF][rR][oO][mM]\s+(.+)", source):
        # The capture runs to the end of the line; cut it at the next double
        # quote when there is one.  Without a quote the text is kept whole
        # (slicing with find()'s -1 would silently drop the last character).
        quote_at = candidate.find('"', 1)
        if quote_at != -1:
            candidate = candidate[:quote_at]

        # Skip interpolated names (``$``) and anything that looks temporary.
        if "$" in candidate or "tmp" in candidate or "temp" in candidate:
            continue

        space_at = candidate.find(' ', 1)
        if space_at == -1:
            table = candidate
        else:
            # Keep only the first token; whatever follows the space is dropped.
            table = candidate[:space_at]
            if table == '"':
                continue
        rows.append(table + "," + app_name + ",INPUT")
    return _dedupe(rows)


def _collect_output_rows(source, app_name):
    """Build the ``app,table,OUTPUT`` rows for every write target found."""
    targets = [
        argument
        for _call, argument in re.findall(
            r"(saveToEs\s*\(\s*|valueOf\s*\(|OUTPUT_TABLE\s*,\s*|saveAsTable\s*\(\s*|insertInto\s*\(\s*)(.+)\)",
            source)
    ]

    # syncEs(...) calls may span several lines, so they are matched separately
    # and contribute their first quoted argument.
    if "syncEs" in source:
        for call in _dedupe(re.findall(r"syncEs\s*?[\s\S]*?\)", source)):
            if '"' in call:
                targets.append('"' + call.split('"')[1] + '"')

    rows = []
    for argument in targets:
        opening = argument.find('"')
        if opening == -1:
            # No string literal: treat the argument as a variable name and
            # look for `name = "value"` in the source.  The name is escaped so
            # stray regex metacharacters (e.g. an extra ")") cannot raise
            # re.error or change the pattern's meaning.
            assigned = re.findall(re.escape(argument) + r"\s*=\s*\"(.+)\"", source)
            if assigned:
                rows.append(app_name + "," + assigned[0] + ",OUTPUT")
        # NOTE(review): an argument containing a quote can never equal the bare
        # name "snapTable", so that comparison is always true here; it may have
        # been intended for the variable branch above -- confirm.
        elif argument != "snapTable" and "index" not in argument:
            start = opening + 1
            # Search for the closing quote after the opening one, so a literal
            # that is not at the very start of the argument is still extracted.
            closing = argument.find('"', start)
            if closing == -1:
                closing = len(argument)
            rows.append(app_name + "," + argument[start:closing] + ",OUTPUT")
    return _dedupe(rows)


def writeTable(path, fileName, output_dir='E:\\relationship-analysis\\file\\'):
    """Extract table relationships from one Scala source file into a CSV.

    The file is scanned with regular expressions for the ``.appName("...")``
    value, for tables read through ``FROM`` clauses, and for write targets
    (``saveToEs``, ``valueOf``, ``OUTPUT_TABLE``, ``saveAsTable``,
    ``insertInto``, ``syncEs``).  Input rows are written as
    ``table,app,INPUT`` and output rows as ``app,table,OUTPUT``; rows are
    de-duplicated and emitted in first-seen order, inputs before outputs.

    Args:
        path: Path of the Scala file to scan (read as UTF-8).
        fileName: Base name, without extension, of the CSV to append to.
        output_dir: Directory holding the CSV.  Defaults to the location the
            script has always used.

    Returns:
        None.  Nothing is written when the file has no ``.appName("...")``.
    """
    # A single read is enough; the context manager closes the handle.
    with open(path, encoding='utf-8') as scala_file:
        source = scala_file.read()

    # Application name
    app_names = re.findall(r"\.appName\(\"(.+)\"\)", source)
    if not app_names:
        return
    app_name = app_names[0]

    # Input tables first, then output tables
    rows = _collect_input_rows(source, app_name) + _collect_output_rows(source, app_name)

    # Append mode: every scanned file of one run adds to the same CSV.
    with open(os.path.join(output_dir, fileName + '.csv'), mode='a') as relation:
        relation.writelines(row + "\n" for row in rows)
if __name__ == '__main__':
    # Directory tree that is walked for Scala sources.
    target_dir = "E:\\spark2\\"
    # All files found in one run share a single, timestamped CSV name.
    run_stamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
    fileName = "relation-" + run_stamp

    for folder, _subfolders, entries in os.walk(target_dir):
        for entry in entries:
            if not entry.endswith(".scala"):
                continue
            scala_path = os.path.join(folder, entry)
            # Echo the file being processed, then extract its relationships.
            print(scala_path)
            writeTable(scala_path, fileName)
|