diff --git a/libnd4j/CMakeLists.txt b/libnd4j/CMakeLists.txt
index c82b0b217..cf9d4ff88 100755
--- a/libnd4j/CMakeLists.txt
+++ b/libnd4j/CMakeLists.txt
@@ -5,7 +5,7 @@ option(NATIVE "Optimize for build machine (might not work on others)" OFF)
 set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
 #ensure we create lib files
 set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF)
-
+option(CHECK_VECTORIZATION "checks for vectorization" OFF)
 option(BUILD_TESTS "Build tests" OFF)
 option(FLATBUFFERS_BUILD_FLATC "Enable the build of the flatbuffers compiler" OFF)
 set(FLATBUFFERS_BUILD_FLATC "OFF" CACHE STRING "Hack to disable flatc build" FORCE)
diff --git a/libnd4j/README.md b/libnd4j/README.md
index 9cea1b597..ec17c6227 100644
--- a/libnd4j/README.md
+++ b/libnd4j/README.md
@@ -17,8 +17,11 @@ There's few additional arguments for `buildnativeoperations.sh` script you could
 -b release OR -b debug // enables/desables debug builds. release is considered by default
 -j XX // this argument defines how many threads will be used to binaries on your box. i.e. -j 8
 -cc XX// CUDA-only argument, builds only binaries for target GPU architecture. use this for fast builds
+--check-vectorization // generates an auto-vectorization report for developers. (Currently, only GCC is supported)
 ```
+[More about the auto-vectorization report](auto_vectorization/AutoVectorization.md)
+
 
 You can find the compute capability for your card [on the NVIDIA website here](https://developer.nvidia.com/cuda-gpus).
 For example, a GTX 1080 has compute capability 6.1, for which you would use ```-cc 61``` (note no decimal point).
 
diff --git a/libnd4j/auto_vectorization/AutoVectorization.md b/libnd4j/auto_vectorization/AutoVectorization.md
new file mode 100644
index 000000000..61b98febe
--- /dev/null
+++ b/libnd4j/auto_vectorization/AutoVectorization.md
@@ -0,0 +1,49 @@
+# Auto-vectorization Report
+
+This report tool produces a human-friendly view of the compiler output for the auto-vectorization process.
+It is intended to help developers investigate the obstacles the compiler faced during auto-vectorization.
+
+## Usage
+The ```--check-vectorization``` option should be added to the **release** build to obtain the auto-vectorization report:
+```./buildnativeoperations.sh -a native -j 28 --check-vectorization```
+It will output ```vecmiss.html``` inside the blasbuild/cpu folder.
+
+## Report Format
+Each file name contains info about the optimization attempts for its source code lines.
+Each line number is also expandable (⇲) and contains distinct failure notes.
+It is possible to click on a line number to see the source code.
+
+| file name | total successful attempts | total failed attempts | ⇲ |
+|---|---|---|--|
+| line number | successful attempts | failed attempts | ⇲ |
+|- failure reasons |
+| line number | successful attempts | failed attempts |⇲ |
+
+##### Requirements
+- GCC (Currently, only GCC is supported)
+- python3
+
+### Detailed report with `-fsave-optimization-record` option:
+If you want to get more detailed information (for now, it reports the functions in which failures occurred) you should use a newer toolchain (GCC > 9), as newer GCC compilers provide the `-fsave-optimization-record` option.
+`buildnativeoperations.sh` and CMake will detect it and switch to the more detailed version.
+Please note that this option is still experimental, so the compiler may fail with an error while writing a json.gz file.
+In that case, try to exclude those files from the build.
+Also note that the internal structure of the `-fsave-optimization-record` json.gz output may change in the future.
+
+It outputs two files, **vecmiss_fsave.html** and **vecmiss_fsave.html.js**, so to see report details you need to enable JavaScript in your browser if it is disabled.
+ +##### Requirements for the Detailed report +- GCC version > 9 +- python3 +- Cython (python3) +- json (python3) +- gzip (python3) +- c++filt + +Internally, we are using Cython to speed up json.gz file processing (bigGzipJson.pyx). Because json.gz files can take big memory in raw when loaded in whole. + +If you want to use bigGzipJson outside `buildnativeoperations.sh` and CMake then you should compile it manually using this command in auto_vectorization folder: +`python3 cython_setup.py build_ext --inplace` + +json.gz files could be processed outside of `buildnativeoperations.sh`. +You need to call `python3 auto_vect.py --fsave` inside base source folder and where json.gz files exist. + diff --git a/libnd4j/auto_vectorization/auto_vect.py b/libnd4j/auto_vectorization/auto_vect.py new file mode 100644 index 000000000..f98dc7422 --- /dev/null +++ b/libnd4j/auto_vectorization/auto_vect.py @@ -0,0 +1,546 @@ +''' +@author : Abdelrauf rauf@konduit.ai +''' +import re +import sys +import os +import subprocess +import fnmatch +import json +import gzip +try: + from bigGzipJson import json_gzip_extract_objects +except ImportError: + pass +from pathlib import Path +from multiprocessing import Pool, Manager ,cpu_count +import traceback +import html + +mtch = re.compile(r"[^/]*([^:]+)\:(\d+)\:(\d+)\:(.*)") +replace_msg = re.compile(r"(\d+)?\.?(\d+)?_?\d+\.?(\d+)?") +progress_msg = re.compile(r"\s{0,4}\[\s{0,2}\d+\%\]") +file_dir_strip = str(Path(os.getcwd())) +pp_index = file_dir_strip.rfind("libnd4j") +if pp_index>=0: + file_dir_strip =file_dir_strip[:pp_index+len("libnd4j")] +BASE_URL = "https://github.com/eclipse/deeplearning4j/tree/master/libnd4j/" +if BASE_URL.endswith("/")==False: + BASE_URL = BASE_URL + "/" +#print(file_dir_strip) +class info: + def __repr__(self): + return str(self.__dict__) + +FSAVE_IGNORE_EXTERNALS = True + +def get_cxx_filt_result(strx): + if len(strx)<1: + return "" + res = subprocess.Popen(["c++filt","-i", strx], 
stdout=subprocess.PIPE).communicate()[0] + res =res.decode('utf-8') + #replace some long names to reduce size + res = res.replace("unsigned long long", "uLL") + res = res.replace("unsigned long int","uL") + res = res.replace("unsigned long", "uL") + res = res.replace("unsigned int", "ui") + res = res.replace("unsigned char", "uchar") + res = res.replace("unsigned short", "ushort") + res = res.replace("long long", "LL") + res = res.replace(", ",",") + return res.strip() + + +def internal_glob(dir, match): + listx = [] + for root, dirnames, filenames in os.walk(dir): + for filename in fnmatch.filter(filenames, match): + listx.append(os.path.join(root, filename)) + return listx + +def get_obj_json_gz(filename): + with gzip.GzipFile(filename, 'r') as f: + return json.loads(f.read().decode('utf-8'))[-1] + + + +def get_msg(msg): + msg = msg.lower().strip() + if "note: not vectorized:" in msg: + msg = replace_msg.sub("_numb",msg.replace("note: not vectorized:","")) + return( 0, 1, msg.strip()) + elif "loop vectorized" in msg: + return (1, 0, None) + # elif msg.startswith("missed")==False: + # msg = replace_msg.sub("_numb",msg) + # return( 0, 0, msg.strip()) + return None + + + + +class File_Info: + ''' + Holds information about vectorized and miss vectorized lines for one file + ''' + + def __init__(self): + self.infos = {} + self.total_opted =0 + self.total_missed = 0 + self.external = False + + + def add_line(self, line_pos): + if line_pos not in self.infos: + v = info() + v.optimized = 0 + v.missed = 0 + v.miss_details = set() + self.infos[line_pos] = v + return v + else: + return self.infos[line_pos] + + + def add_line_fsave(self, line_pos): + if line_pos not in self.infos: + v = info() + v.optimized = 0 + v.missed = 0 + v.miss_details2 = dict() + self.infos[line_pos] = v + return v + else: + return self.infos[line_pos] + + + + def add_fsave(self, line_pos,success, msg, function ,inline_fns=''): + v = self.add_line_fsave(line_pos) + if success and "loop vectorized" in 
msg: + v.optimized +=1 + self.total_opted +=1 + elif success==False and "not vectorized:" in msg: + #reduce this msg + msg = msg.replace("not vectorized:","") + v.missed +=1 + self.total_missed +=1 + msg = sys.intern(msg) + if msg in v.miss_details2: + ls = v.miss_details2.get(msg) + ls.add(function) + else: + ls =set() + v.miss_details2[msg]=ls + ls.add(function) + return self + + def add(self, line_pos, msg_x): + v = self.add_line(line_pos) + if msg_x is not None: + v.optimized += msg_x[0] + v.missed += msg_x[1] + self.total_opted += msg_x[0] + self.total_missed += msg_x[1] + if msg_x[2] is not None: + v.miss_details.add(msg_x[2]) + return self + + + def __repr__(self): + return str(self.__dict__) + + + + +def process_gzip_json_mp(args): + process_gzip_json_new(*args) + +def process_gzip_json_new(json_gz_fname,list_Queue): + gz_name = Path(json_gz_fname).stem + #print("::--open and process {0}".format(gz_name)) + queue_count = len(list_Queue) + #print(queue_count) + q = list_Queue[0] + old_fname = '' + total_c = 0 + for x in json_gzip_extract_objects(json_gz_fname,'message','vectorized'): + external_source = True + if len(x['message'])>0 and 'location' in x: + line = int(x['location']['line']) + file_name = x['location']['file'].strip() + if file_dir_strip in file_name: + file_name = file_name.replace(file_dir_strip,'./') + external_source = False + msg = x['message'][0] + success = x['kind'] == 'success' + func = '' if 'function' not in x else x['function'] + + if file_name!=old_fname: + #send our info to the right consumer + queue_ind = hash(file_name) % queue_count + #print("quen index {0}".format(queue_ind)) + q =list_Queue[queue_ind] + old_fname = file_name + total_c +=1 + #print("pp {0} {1}".format(q,(file_name,line,success, msg, func,external_source ))) + if FSAVE_IGNORE_EXTERNALS==True and external_source == True: + continue + q.put((file_name,line,success, msg, func,external_source )) + print("::finished {0:60s} :{1:8d}".format(gz_name,total_c)) + +def 
consume_processed_mp(args): + return consume_processed_new(*args) + + + +def consume_processed_new(list_Queue , c_index): + + info_ = dict() + func_list = dict() + last_func_index = 0 + q = list_Queue[c_index] + print("::consumer {0}".format(c_index)) + total_c = 0 + r_c = 0 + while True: + #print("try to get new from {0}".format(index)) + obj = q.get() + #print("cc {0} {1}".format(q,obj)) + if obj==None: + break #we received the end + file_name,line,success, msg, func, external_source = obj + try: + #get function index + func_index = -1 + if func in func_list: + func_index = func_list[func] + else: + func_list[func] = last_func_index + func_index = last_func_index + last_func_index +=1 + + if file_name in info_: + info_[file_name].add_fsave(line, success, msg, func_index) + else: + info_[file_name] = File_Info().add_fsave(line, success, msg, func_index) + info_[file_name].external = external_source + total_c +=1 + if total_c - r_c >10000: + r_c = total_c + print("::consumer {0:2d} :{1:10d}".format(c_index,total_c)) + except Exception as e: + print(traceback.format_exc()) + break + + print("::consumer {0:2d} :{1:10d}".format(c_index,total_c)) + #write to temp file + wr_fname= "vecmiss_fsave{0}.html".format(str(c_index) if len(list_Queue)>1 else '') + print("generate report for consumer {0} {1}".format(c_index,len(info_))) + try: + uniq_ind = str(c_index)+'_' if len(list_Queue)>1 else '' + generate_report(wr_fname,info_ ,only_body = False, unique_id_prefix = uniq_ind,fsave_format = True, function_list= func_list) + print(" consumer {0} saved output into {1}".format(c_index,wr_fname)) + except Exception as e: + print(traceback.format_exc()) + + + +def obtain_info_from(input_): + info_ = dict() + for line in input_: + x = mtch.match(line) + external_source = True + if x: + file_name =x.group(1).strip() + if file_dir_strip in file_name: + file_name = file_name.replace(file_dir_strip,'') + external_source = False + line_number = int(x.group(2)) + msg = 
x.group(4).lower() + msg = msg.replace(file_dir_strip,'./') + msg_x = get_msg(msg) + if msg_x is None: + continue + if file_name in info_: + #ignore col_number + info_[file_name].add(line_number,msg_x) + else: + #print("{0} {1}".format(file_name,external_source)) + info_[file_name] = File_Info().add(line_number,msg_x) + info_[file_name].external = external_source + elif progress_msg.match(line): + #actually we redirect only, stderr so this should not happen + print("__"+line.strip()) + elif "error" in line or "Error" in line: + print("****"+line.strip()) + return info_ + + + +def custom_style(fsave): + st = '''''' + +def header(fsave=False): + strx ='\n\n\n\nAuto-Vectorization\n' + strx +=''.format(BASE_URL) + strx +=custom_style(fsave) + strx +='\n\n\n' + return strx + +def footer(): + return '\n' + + +def get_compressed_indices(set_a): + a_len = len(set_a) + if a_len<=1: + if a_len<1: + return '' + return str(set_a)[1:-1] + #we sorted and only saved difference + # 1,14,15,19 --> 1,13,1,4 10bytes=>8bytes + list_sorted = sorted(list(set_a)) + last = list_sorted[0] + str_x = str(list_sorted[0]) + for i in range(1,a_len): + str_x += ','+str(list_sorted[i]-last) + last = list_sorted[i] + return str_x + + + + + +def get_content(k, v, unique_id_prefix = '', fsave_format=False): + inner_str='' + content = '' + inc_id = 0 + for fk,fv in sorted(v.infos.items()): + if fsave_format==True: + inner_str+='
{0}
{1}
{2}