diff --git a/libnd4j/CMakeLists.txt b/libnd4j/CMakeLists.txt
index c82b0b217..cf9d4ff88 100755
--- a/libnd4j/CMakeLists.txt
+++ b/libnd4j/CMakeLists.txt
@@ -5,7 +5,7 @@ option(NATIVE "Optimize for build machine (might not work on others)" OFF)
set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
#ensure we create lib files
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF)
-
+option(CHECK_VECTORIZATION "checks for vectorization" OFF)
option(BUILD_TESTS "Build tests" OFF)
option(FLATBUFFERS_BUILD_FLATC "Enable the build of the flatbuffers compiler" OFF)
set(FLATBUFFERS_BUILD_FLATC "OFF" CACHE STRING "Hack to disable flatc build" FORCE)
diff --git a/libnd4j/README.md b/libnd4j/README.md
index 9cea1b597..ec17c6227 100644
--- a/libnd4j/README.md
+++ b/libnd4j/README.md
@@ -17,8 +17,11 @@ There's few additional arguments for `buildnativeoperations.sh` script you could
-b release OR -b debug // enables/desables debug builds. release is considered by default
-j XX // this argument defines how many threads will be used to binaries on your box. i.e. -j 8
-cc XX// CUDA-only argument, builds only binaries for target GPU architecture. use this for fast builds
+ --check-vectorization // generates an auto-vectorization report for developers. (Currently, only GCC is supported)
```
+[More about AutoVectorization report](auto_vectorization/AutoVectorization.md)
+
You can find the compute capability for your card [on the NVIDIA website here](https://developer.nvidia.com/cuda-gpus).
For example, a GTX 1080 has compute capability 6.1, for which you would use ```-cc 61``` (note no decimal point).
diff --git a/libnd4j/auto_vectorization/AutoVectorization.md b/libnd4j/auto_vectorization/AutoVectorization.md
new file mode 100644
index 000000000..61b98febe
--- /dev/null
+++ b/libnd4j/auto_vectorization/AutoVectorization.md
@@ -0,0 +1,49 @@
+# Auto-vectorization Report
+
+This report tool produces a human-friendly view of the compiler output for the auto-vectorization process. It is intended to help developers investigate the obstacles the compiler faced during auto-vectorization.
+
+## Usage
+```--check-vectorization``` option should be added to the **release** build to be able to get the auto-vectorization report
+```./buildnativeoperations.sh -a native -j 28 --check-vectorization```
+It will output ```vecmiss.html``` inside the blasbuild/cpu folder.
+
+## Report Format
+Each filename contains info about optimization attempts for the source code lines.
+Each line number is also expandable (⇲) and contains distinct failure notes.
+It is possible to click on the line number to see source code
+
+| file name | total successful attempts | total failed attempts | ⇲ |
+|---|---|---|--|
+| line number | successful attempts | failed attempts | ⇲ |
+|- failure reasons |
+| line number | successful attempts | failed attempts |⇲ |
+
+##### Requirements
+- GCC (Currently, only GCC is supported)
+- python3
+
+### Detailed report with `-fsave-optimization-record` option:
+If you want to get more detailed information (for now it reports the functions in which the failures occurred) you should use a newer version of the toolchain (GCC > 9), as newer GCC compilers have the `-fsave-optimization-record` option.
+`buildnativeoperations.sh` (via CMake) will detect it and switch to the more detailed version.
+Please note that this option is still experimental, so the compiler may fail to output some json.gz files or report an error.
+In that case, try to exclude those files from the build.
+Also note that the internal structure of the `-fsave-optimization-record` json.gz output may change in the future.
+
+It outputs two files, **vecmiss_fsave.html** and **vecmiss_fsave.html.js**, so to see the report details you need to enable JavaScript in your browser if it is disabled.
+
+##### Requirements for the Detailed report
+- GCC version > 9
+- python3
+- Cython (python3)
+- json (python3)
+- gzip (python3)
+- c++filt
+
+Internally, we use Cython to speed up json.gz file processing (bigGzipJson.pyx), because json.gz files can consume a large amount of memory when loaded whole.
+
+If you want to use bigGzipJson outside `buildnativeoperations.sh` and CMake then you should compile it manually using this command in auto_vectorization folder:
+`python3 cython_setup.py build_ext --inplace`
+
+json.gz files could be processed outside of `buildnativeoperations.sh`.
+You need to call `python3 auto_vect.py --fsave` inside base source folder and where json.gz files exist.
+
diff --git a/libnd4j/auto_vectorization/auto_vect.py b/libnd4j/auto_vectorization/auto_vect.py
new file mode 100644
index 000000000..f98dc7422
--- /dev/null
+++ b/libnd4j/auto_vectorization/auto_vect.py
@@ -0,0 +1,546 @@
+'''
+@author : Abdelrauf rauf@konduit.ai
+'''
+import re
+import sys
+import os
+import subprocess
+import fnmatch
+import json
+import gzip
+try:
+ from bigGzipJson import json_gzip_extract_objects
+except ImportError:
+ pass
+from pathlib import Path
+from multiprocessing import Pool, Manager ,cpu_count
+import traceback
+import html
+
+mtch = re.compile(r"[^/]*([^:]+)\:(\d+)\:(\d+)\:(.*)")  # parses GCC diagnostics of the form "file:line:col:message"
+replace_msg = re.compile(r"(\d+)?\.?(\d+)?_?\d+\.?(\d+)?")  # collapses numeric noise in messages so similar notes dedupe
+progress_msg = re.compile(r"\s{0,4}\[\s{0,2}\d+\%\]")  # matches make-style "[ NN%]" build-progress lines
+file_dir_strip = str(Path(os.getcwd()))  # cwd prefix stripped from file paths to make report paths relative
+pp_index = file_dir_strip.rfind("libnd4j")
+if pp_index>=0:
+    file_dir_strip =file_dir_strip[:pp_index+len("libnd4j")]  # trim to the libnd4j project root
+BASE_URL = "https://github.com/eclipse/deeplearning4j/tree/master/libnd4j/"  # base link target for source lines in the HTML report
+if BASE_URL.endswith("/")==False:
+    BASE_URL = BASE_URL + "/"
+#print(file_dir_strip)
+class info:  # plain attribute bag for per-line counters; attributes are attached dynamically by File_Info
+    def __repr__(self):
+        return str(self.__dict__)
+
+FSAVE_IGNORE_EXTERNALS = True  # when True, diagnostics from files outside the project tree are dropped
+
+def get_cxx_filt_result(strx):  # demangle a C++ symbol with the external c++filt tool, then shorten common type names to reduce report size
+    if len(strx)<1:
+        return ""
+    res = subprocess.Popen(["c++filt","-i", strx], stdout=subprocess.PIPE).communicate()[0]
+    res =res.decode('utf-8')
+    #replace some long names to reduce size
+    res = res.replace("unsigned long long", "uLL")
+    res = res.replace("unsigned long int","uL")
+    res = res.replace("unsigned long", "uL")
+    res = res.replace("unsigned int", "ui")
+    res = res.replace("unsigned char", "uchar")
+    res = res.replace("unsigned short", "ushort")
+    res = res.replace("long long", "LL")
+    res = res.replace(", ",",")
+    return res.strip()
+
+
+def internal_glob(dir, match):  # recursively collect paths under `dir` whose filename matches the fnmatch pattern `match`
+    listx = []
+    for root, dirnames, filenames in os.walk(dir):
+        for filename in fnmatch.filter(filenames, match):
+            listx.append(os.path.join(root, filename))
+    return listx
+
+def get_obj_json_gz(filename):  # load a gzipped JSON file fully into memory and return the last top-level object
+    with gzip.GzipFile(filename, 'r') as f:
+        return json.loads(f.read().decode('utf-8'))[-1]
+
+
+
+def get_msg(msg):  # classify a compiler message -> (optimized, missed, detail-or-None), or None if irrelevant
+    msg = msg.lower().strip()
+    if "note: not vectorized:" in msg:
+        msg = replace_msg.sub("_numb",msg.replace("note: not vectorized:",""))  # normalize numbers so identical reasons dedupe
+        return( 0, 1, msg.strip())
+    elif "loop vectorized" in msg:
+        return (1, 0, None)
+    # elif msg.startswith("missed")==False:
+    #     msg = replace_msg.sub("_numb",msg)
+    #     return( 0, 0, msg.strip())
+    return None
+
+
+
+
+class File_Info:
+    '''
+    Holds information about vectorized and miss vectorized lines for one file
+    '''
+
+    def __init__(self):
+        self.infos = {}  # line number -> info record
+        self.total_opted =0  # file-wide count of successful vectorizations
+        self.total_missed = 0  # file-wide count of missed vectorizations
+        self.external = False  # True when the file lies outside the project tree
+
+
+    def add_line(self, line_pos):  # get-or-create the info record for a line (plain-text parse path; details kept as a set)
+        if line_pos not in self.infos:
+            v = info()
+            v.optimized = 0
+            v.missed = 0
+            v.miss_details = set()
+            self.infos[line_pos] = v
+            return v
+        else:
+            return self.infos[line_pos]
+
+
+    def add_line_fsave(self, line_pos):  # get-or-create the info record for a line (fsave/json path; details map msg -> function set)
+        if line_pos not in self.infos:
+            v = info()
+            v.optimized = 0
+            v.missed = 0
+            v.miss_details2 = dict()
+            self.infos[line_pos] = v
+            return v
+        else:
+            return self.infos[line_pos]
+
+
+
+    def add_fsave(self, line_pos,success, msg, function ,inline_fns=''):  # record one -fsave-optimization-record entry; returns self for chaining
+        v = self.add_line_fsave(line_pos)
+        if success and "loop vectorized" in msg:
+            v.optimized +=1
+            self.total_opted +=1
+        elif success==False and "not vectorized:" in msg:
+            #reduce this msg
+            msg = msg.replace("not vectorized:","")
+            v.missed +=1
+            self.total_missed +=1
+            msg = sys.intern(msg)  # intern: the same failure text repeats across many lines/files
+            if msg in v.miss_details2:
+                ls = v.miss_details2.get(msg)
+                ls.add(function)
+            else:
+                ls =set()
+                v.miss_details2[msg]=ls
+                ls.add(function)
+        return self
+
+    def add(self, line_pos, msg_x):  # record one classified plain-text message tuple (optimized, missed, detail); returns self for chaining
+        v = self.add_line(line_pos)
+        if msg_x is not None:
+            v.optimized += msg_x[0]
+            v.missed += msg_x[1]
+            self.total_opted += msg_x[0]
+            self.total_missed += msg_x[1]
+            if msg_x[2] is not None:
+                v.miss_details.add(msg_x[2])
+        return self
+
+
+    def __repr__(self):
+        return str(self.__dict__)
+
+
+
+
+def process_gzip_json_mp(args):  # Pool.map adapter: unpacks the (json_gz_fname, list_Queue) tuple
+    process_gzip_json_new(*args)
+
+def process_gzip_json_new(json_gz_fname,list_Queue):  # producer: stream vectorization records out of one json.gz file and push them onto consumer queues
+    gz_name = Path(json_gz_fname).stem
+    #print("::--open and process {0}".format(gz_name))
+    queue_count = len(list_Queue)
+    #print(queue_count)
+    q = list_Queue[0]
+    old_fname = ''
+    total_c = 0
+    for x in json_gzip_extract_objects(json_gz_fname,'message','vectorized'):
+        external_source = True
+        if len(x['message'])>0 and 'location' in x:
+            line = int(x['location']['line'])
+            file_name = x['location']['file'].strip()
+            if file_dir_strip in file_name:
+                file_name = file_name.replace(file_dir_strip,'./')
+                external_source = False
+            msg = x['message'][0]
+            success = x['kind'] == 'success'
+            func = '' if 'function' not in x else x['function']
+
+            if file_name!=old_fname:
+                #send our info to the right consumer
+                queue_ind = hash(file_name) % queue_count  # hash routing keeps all records of a file on the same consumer
+                #print("quen index {0}".format(queue_ind))
+                q =list_Queue[queue_ind]
+                old_fname = file_name
+            total_c +=1
+            #print("pp {0} {1}".format(q,(file_name,line,success, msg, func,external_source )))
+            if FSAVE_IGNORE_EXTERNALS==True and external_source == True:
+                continue  # skip diagnostics originating outside the project tree
+            q.put((file_name,line,success, msg, func,external_source ))
+    print("::finished {0:60s} :{1:8d}".format(gz_name,total_c))
+
+def consume_processed_mp(args):  # Pool.map adapter: unpacks the (list_Queue, c_index) tuple
+    return consume_processed_new(*args)
+
+
+
+def consume_processed_new(list_Queue , c_index):  # consumer: drain queue c_index, aggregate records per file, then write an HTML report
+
+    info_ = dict()  # file name -> File_Info
+    func_list = dict()  # function name -> compact integer index (to shrink the report)
+    last_func_index = 0
+    q = list_Queue[c_index]
+    print("::consumer {0}".format(c_index))
+    total_c = 0
+    r_c = 0
+    while True:
+        #print("try to get new from {0}".format(index))
+        obj = q.get()  # blocks until the producer sends a record
+        #print("cc {0} {1}".format(q,obj))
+        if obj==None:
+            break #we received the end
+        file_name,line,success, msg, func, external_source = obj
+        try:
+            #get function index
+            func_index = -1
+            if func in func_list:
+                func_index = func_list[func]
+            else:
+                func_list[func] = last_func_index
+                func_index = last_func_index
+                last_func_index +=1
+
+            if file_name in info_:
+                info_[file_name].add_fsave(line, success, msg, func_index)
+            else:
+                info_[file_name] = File_Info().add_fsave(line, success, msg, func_index)
+                info_[file_name].external = external_source
+            total_c +=1
+            if total_c - r_c >10000:
+                r_c = total_c  # progress log only every 10000 records
+                print("::consumer {0:2d} :{1:10d}".format(c_index,total_c))
+        except Exception as e:
+            print(traceback.format_exc())
+            break
+
+    print("::consumer {0:2d} :{1:10d}".format(c_index,total_c))
+    #write to temp file
+    wr_fname= "vecmiss_fsave{0}.html".format(str(c_index) if len(list_Queue)>1 else '')  # suffix with consumer index when several consumers run
+    print("generate report for consumer {0} {1}".format(c_index,len(info_)))
+    try:
+        uniq_ind = str(c_index)+'_' if len(list_Queue)>1 else ''
+        generate_report(wr_fname,info_ ,only_body = False, unique_id_prefix = uniq_ind,fsave_format = True, function_list= func_list)
+        print(" consumer {0} saved output into {1}".format(c_index,wr_fname))
+    except Exception as e:
+        print(traceback.format_exc())
+
+
+
+def obtain_info_from(input_):  # parse plain-text GCC vectorization diagnostics (one line at a time) into a {file name: File_Info} dict
+    info_ = dict()
+    for line in input_:
+        x = mtch.match(line)  # expects "file:line:col:message"
+        external_source = True
+        if x:
+            file_name =x.group(1).strip()
+            if file_dir_strip in file_name:
+                file_name = file_name.replace(file_dir_strip,'')
+                external_source = False
+            line_number = int(x.group(2))
+            msg = x.group(4).lower()
+            msg = msg.replace(file_dir_strip,'./')
+            msg_x = get_msg(msg)
+            if msg_x is None:
+                continue  # message is neither a success nor a miss note
+            if file_name in info_:
+                #ignore col_number
+                info_[file_name].add(line_number,msg_x)
+            else:
+                #print("{0} {1}".format(file_name,external_source))
+                info_[file_name] = File_Info().add(line_number,msg_x)
+                info_[file_name].external = external_source
+        elif progress_msg.match(line):
+            #actually we redirect only, stderr so this should not happen
+            print("__"+line.strip())
+        elif "error" in line or "Error" in line:
+            print("****"+line.strip())
+    return info_
+
+
+
+def custom_style(fsave):  # NOTE(review): body appears truncated in this patch view — it assigns st but returns None, yet header() concatenates its result; verify against the full file (CSS content likely stripped)
+    st = ''''''
+
+def header(fsave=False):  # NOTE(review): the HTML markup in these string literals appears stripped/garbled in this patch view — verify against the full file before editing
+    strx ='\n\n
 \n\nAuto-Vectorization\n'
+    strx +=''.format(BASE_URL)
+    strx +=custom_style(fsave)
+    strx +='\n\n\n'
+    return strx
+
+def footer():  # NOTE(review): closing HTML markup appears stripped in this patch view — verify against the full file
+    return '\n'
+
+
+def get_compressed_indices(set_a):  # delta-encode a set of ints as "first,d1,d2,..." to shrink the HTML payload
+    a_len = len(set_a)
+    if a_len<=1:
+        if a_len<1:
+            return ''
+        return str(set_a)[1:-1]  # single element: strip braces from the set repr, e.g. "{7}" -> "7"
+    #we sorted and only saved difference
+    # 1,14,15,19 --> 1,13,1,4 10bytes=>8bytes
+    list_sorted = sorted(list(set_a))
+    last = list_sorted[0]
+    str_x = str(list_sorted[0])
+    for i in range(1,a_len):
+        str_x += ','+str(list_sorted[i]-last)
+        last = list_sorted[i]
+    return str_x
+
+
+
+
+
+def get_content(k, v, unique_id_prefix = '', fsave_format=False):
+ inner_str=''
+ content = ''
+ inc_id = 0
+ for fk,fv in sorted(v.infos.items()):
+ if fsave_format==True:
+ inner_str+='