# compute-duplication-total.py

#!/usr/bin/env python3

import argparse
import csv
import json
import numpy
import os
import pandas
import sys

from scipy.stats.mstats import gmean

def _read_duplication_factor(metrics_path, binary_name, function_name):
  """Return the duplication factor stored in one revng metrics file.

  The file must hold exactly two CSV rows: a header row, which is ignored,
  and one data row whose third field is parsed as the duplication factor.

  Args:
    metrics_path: Path of the metrics file to read.
    binary_name: Name of the binary, used only in error messages.
    function_name: Name of the function, used only in error messages.

  Returns:
    The duplication factor as a float.

  Raises:
    SystemExit: If the file does not contain exactly two rows, or the data
      row has fewer than three fields.
    ValueError: If the duplication factor field is not a valid float.
  """
  with open(metrics_path, 'r', newline='') as metrics_file:
    rows = list(csv.reader(metrics_file, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL))

  # Too few rows used to surface as a bare IndexError; report it the same way
  # as the "too many rows" case.
  if len(rows) != 2:
    sys.exit('bad number of lines in revng metrics file: ' + binary_name + ' ' + function_name)

  data_row = rows[1]
  if len(data_row) < 3:
    sys.exit('bad number of fields in revng metrics file: ' + binary_name + ' ' + function_name)

  return float(data_row[2])


def main():
  """Print duplication statistics for the functions of a set of binaries.

  Reads the list of binary names from BINARIESFILE (one per line). For every
  binary, FUNCTIONIDXDIR/<binary>.csv is read and its first column is taken
  as the function name. For every function,
  REVNGMETRICSDIR/<binary>/duplication/<function> is parsed if it exists;
  functions without a metrics file are silently skipped.

  Three results are printed to stdout:
    * the (function name, factor) pairs whose duplication factor exceeds 5,
    * the geometric mean of all duplication factors,
    * the share of functions whose factor is below 1.01, printed as a
      fraction between 0 and 1 (not multiplied by 100).

  Raises:
    SystemExit: If a metrics file is malformed, or if no metrics file was
      found for any listed function (there is nothing to average).
  """
  parser = argparse.ArgumentParser(description='My nice tool.')
  parser.add_argument('binaries', metavar='BINARIESFILE', help='File containing the names of the binaries')
  # ARCH is accepted for command-line compatibility but is not used below.
  parser.add_argument('arch', metavar='ARCH', help='Architecture')
  parser.add_argument('function_idx_dir', metavar='FUNCTIONIDXDIR', help='Folder containing the function index')
  parser.add_argument('revng_metrics_dir', metavar='REVNGMETRICSDIR', help='Folder containing the metrics files')
  args = parser.parse_args()
  binaries_file = args.binaries
  function_idx_dir = args.function_idx_dir
  revng_metrics_dir = args.revng_metrics_dir

  # Factors above this value are reported individually.
  high_duplication_threshold = 5
  # Factors below this value count as "<1% size increase".
  small_increase_threshold = 1.01

  values = []
  high_duplication = []

  with open(binaries_file, 'r') as binaries:
    stripped_lines = (line.rstrip('\n') for line in binaries)
    # Blank lines (e.g. a trailing one) do not name a binary.
    binaries_list = [name for name in stripped_lines if name]

  for binary_name in binaries_list:
    functions_csv_file = os.path.join(function_idx_dir, binary_name + '.csv')
    with open(functions_csv_file, 'r', newline='') as function_csv:
      csvreader = csv.reader(function_csv, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)

      for function in csvreader:
        # The csv reader yields an empty list for blank lines.
        if not function:
          continue
        function_name = function[0]

        revng_metrics_file = os.path.join(revng_metrics_dir, binary_name, 'duplication', function_name)
        if not os.path.isfile(revng_metrics_file):
          continue

        duplication_factor = _read_duplication_factor(revng_metrics_file, binary_name, function_name)
        values.append(duplication_factor)
        if duplication_factor > high_duplication_threshold:
          high_duplication.append((function_name, duplication_factor))

  # Without any sample the mean and the ratio below are undefined; fail with
  # a clear message instead of a ZeroDivisionError.
  if not values:
    sys.exit('no duplication metrics found for the functions listed in: ' + binaries_file)

  total = len(values)
  under_percentage = sum(1 for factor in values if factor < small_increase_threshold)

  print("Functions that present a very high duplication duplication:")
  for touple in high_duplication:
    print(touple)

  print("Geometric mean of the size increase:")
  print(gmean(values))

  print("Percentage of the functions that have <1% size increase:")
  print(under_percentage/total)

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
  main()