Add scripts for computing duplication

774f0ec8 · Andrea Gussoni · f8c498dc · 774f0ec8 · 774f0ec8 · 774f0ec8
Commit 774f0ec8 authored 4 years ago by Andrea Gussoni
--- a/config.sh
+++ b/config.sh
@@ -2,7 +2,10 @@
 basepath=$(pwd)
 binaries_file=$basepath/binaries.txt
 matching_script_dir=$basepath/scripting/boundaries-comparison-scripts
+computation_script_dir=$basepath/scripting/computation
 workdir=$basepath/workdir/$arch
 matching_dir=$workdir/result-matching
+function_idx_dir=$workdir/deduplicated-functions-idx-no-goto
 revng_json_dir=$workdir/revng-json
 ida_json_dir=$workdir/ida-json-normalized
+revng_metrics_dir=$workdir/revng-metrics
--- a/scripting/computation/compute-duplication-total.py
+++ b/scripting/computation/compute-duplication-total.py
+#!/usr/bin/env python3
+
+import argparse
+import csv
+import json
+import numpy
+import os
+import pandas
+import sys
+
+from scipy.stats.mstats import gmean
+
+def main():
+  parser = argparse.ArgumentParser(description='My nice tool.')
+  parser.add_argument('binaries', metavar='BINARIESFILE', help='File containing the names of the binaries')
+  parser.add_argument('arch', metavar='ARCH', help='Architecture')
+  parser.add_argument('function_idx_dir', metavar='FUNCTIONIDXDIR', help='Folder containing the function index')
+  parser.add_argument('revng_metrics_dir', metavar='REVNGMETRICSDIR', help='Folder containing the metrics files')
+  args = parser.parse_args()
+  arch = args.arch
+  binaries_file = args.binaries
+  function_idx_dir = args.function_idx_dir
+  revng_metrics_dir = args.revng_metrics_dir
+
+  values = []
+  high_duplication = []
+
+  with open(binaries_file, 'r') as binaries:
+    binaries_list = [line.rstrip('\n') for line in binaries]
+
+  # Compute under 5% percentage
+  total = 0
+  under_percentage = 0
+
+  for binary_name in binaries_list:
+
+    bynary_values = []
+
+    functions_csv_file = function_idx_dir + '/' + binary_name + '.csv'
+    with open(functions_csv_file, 'r') as function_csv:
+
+      csvreader = csv.reader(function_csv, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
+
+      for function in csvreader:
+        function_name = function[0]
+        function_name_orig = function_name.replace('bb.', '')
+        function_address = hex(int(function[1], 16))
+
+        revng_metrics_touple = []
+
+        revng_metrics_file = revng_metrics_dir + '/' + binary_name + '/duplication/' + function_name
+        if os.path.isfile(revng_metrics_file):
+          with open(revng_metrics_file, 'r') as revng_metrics:
+            csvreadermetrics = csv.reader(revng_metrics, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
+
+            # Very hacky way to skip first line and check that the metrics file is well formed.
+            linecounter = 0
+            for line in csvreadermetrics:
+              if linecounter == 0:
+                linecounter += 1
+                continue
+              elif linecounter == 1:
+                revng_metrics_touple = line[1:]
+                linecounter += 1
+              else:
+                sys.exit('bad number of lines in revng metrics file: ' + binary_name + ' ' + function_name)
+
+            duplication_factor = float(revng_metrics_touple[1])
+            values.append(duplication_factor)
+            if duplication_factor > 5:
+              high_duplication.append((function_name, duplication_factor))
+
+            # Compute under percentage
+            if (duplication_factor < 1.01):
+              under_percentage += 1
+            total += 1
+
+  print("Functions that present a very high duplication duplication:")
+  for touple in high_duplication:
+    print(touple)
+
+  print("Geometric mean of the size increase:")
+  print(gmean(values))
+
+  print("Percentage of the functions that have <1% size increase:")
+  print(under_percentage/total)
+
+# Run the computation only when the file is executed as a script, not when
+# it is imported as a module.
+if __name__ == "__main__":
+  main()
--- a/scripting/computation/compute-duplication-total.sh
+++ b/scripting/computation/compute-duplication-total.sh
+#!/bin/bash
+
+# Compute the aggregated duplication statistics for one architecture.
+#
+# Usage: compute-duplication-total.sh ARCH
+#
+# Must be started from the directory that contains config.sh: the config is
+# sourced through a relative path and derives its base path from $(pwd).
+
+if [ $# -eq 0 ]; then
+  echo "No arguments supplied"
+  exit 1
+fi
+
+# config.sh reads $arch to build the per-architecture work directory.
+arch=$1
+
+# Import the config; without it every path below would expand to nothing.
+. ./config.sh || exit 1
+
+# Quote every expansion so that paths containing spaces stay single arguments.
+"$computation_script_dir/compute-duplication-total.py" "$binaries_file" "$arch" "$function_idx_dir" "$revng_metrics_dir"