I have written a code that randomly generates two matrices from dimensions 2x2 up to 50x50. I am then recording the time it takes for each matrix multiplication from dimensions 2 up to 50. I record this time 100 times to get a good average for each case 2 -50. The program first starts off by multiplying the matrices sequentially and records the average execution time in a csv file. It then moves on to parallel matrix multiplication using pthreads and records the average execution times in a separate csv file. My issue is that the average execution time for the sequential multiplication is a lot shorter than the parallel execution. For a matrix of size 50 the sequential multiplication takes 500 micro-seconds and the parallel multiplication takes 2500 micro-seconds. Is this an issue due to how I am timing the code? Or does my implementation of threads not work very well and actually cause the code to take longer to execute? I am starting the timer after the generation of the matrices and stopping it after all threads are joined together. The thread code was originally written for two matrices of uneven size so it implements a load balancing algorithm.

#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <algorithm>
#include  <vector>
#include <stdlib.h>
#include <pthread.h>
#include <cstdlib>
#include <ctime>
#include <sys/time.h>
#include <chrono>
#include <unistd.h>


using namespace std;
int n,i,j,t,k,l,MAX;
float randomnum,sum1, avg;
float matA[100][100];
float matB[100][100];
float matC[100][100];
struct Loading
{
    int r;
    int c;
    int n;
    int m;
};

// threads
pthread_t threads[100] = { 0 };

// indexes
int indexes[100] = {0};

// load balancing
Loading loads[100] = { 0 };

// for printing in thread
pthread_mutex_t M;

// run thread
void* multi(void* arg)
{
    int index = *((int*)(arg));
    Loading load = loads[index];
    int i = 0;
    int j = 0;
    int k = 0;
    int istart = load.r;
    int jstart = load.c;

    pthread_mutex_lock(&M);
     // cout << "thread #" << index << " pid: " << getpid() << " starting " << " row " << istart << " col "  << jstart << endl;
    pthread_mutex_unlock(&M);

    // logic to balance loads amongst threads using for loop
    int n = load.n;
    for (i = istart; i < MAX; i++)
    {

        for (j =jstart;n > 0 && j < MAX; j++,n--)
        {
            for (k = 0; k < MAX; k++)
            {
                matC[i][j] += matA[i][k] * matB[k][j];
            }

            pthread_mutex_lock(&M);
            //cout << "row " << i << " col "<< j << " value " << matC[i][j] << endl;
            pthread_mutex_unlock(&M);
        }

        jstart = 0;

        if (n == 0)
        {
           pthread_mutex_lock(&M);
          // cout << "thread #" << index << " pid: " << getpid() << " has completed " << endl;
           pthread_mutex_unlock(&M);
           return 0;

        }
    }


    return 0;
}

int num_threads = 0;
int MAX_THREADS= 0;




int main()
{

pthread_mutex_init(&M, NULL);


srand ( time(NULL) );

//for (n=2; n<4; n++) {
ofstream myfile;
    //  myfile.open ("/home/gage/Desktop/timing/seqrecord.csv");
myfile.open ("seqrecord.csv");
      myfile << "testtowork\n";

for (n=2; n<50; n++){
    MAX =n;
    myfile << n <<","; 
  for (int i = 0; i < MAX; i++) {
        for (int j = 0; j < MAX; j++) {
            matA[i][j] = ((float(rand()) / float(RAND_MAX)) * (100 - -50)) + -50;
            matB[i][j] = ((float(rand()) / float(RAND_MAX)) * (100 - -50)) + -50;
        }
  }
for(t=0; t<101; t++){
 //clock_t startTime = clock();
auto start = chrono::steady_clock::now();
  for (i = 0; i < MAX; ++i)
for (j = 0; j < MAX; ++j)
for (k = 0; k < MAX; ++k)
{
matC[i][j] += matA[i][k] * matB[k][j];
}


//int stop_s=clock();
auto end = chrono::steady_clock::now();
//cout << double( clock() - startTime ) / (double)CLOCKS_PER_SEC/1000000000<< " milli-seconds." << endl;
//cout << chrono::duration_cast<chrono::microseconds>(end - start).count() <<endl;
myfile << chrono::duration_cast<chrono::microseconds>(end - start).count() <<",";
sum1 = sum1+chrono::duration_cast<chrono::microseconds>(end - start).count();

}
avg = sum1 / 100;
myfile << "Average execution" << "," << avg << "\n";
sum1 =0;
avg = 0;


// }
}

myfile.close();
ofstream myfile1;

myfile1.open ("parallel.csv");
      myfile1 << "testtowork\n";





for (n=2; n<51; n++)
{
MAX = n;
MAX_THREADS = n*n;
num_threads =n;
 myfile1 << n <<","; 
  for (int i = 0; i < MAX; i++) {
        for (int j = 0; j < MAX; j++) {
            matA[i][j] = ((float(rand()) / float(RAND_MAX)) * (100 - -50)) + -50;
            matB[i][j] = ((float(rand()) / float(RAND_MAX)) * (100 - -50)) + -50;
        }
  }
           for(t=0; t<101; t++){
         //clock_t startTime = clock();
            auto start = chrono::steady_clock::now();
         // calculade load balancing

        // cout << "calculation load balancing" << endl;

            double nwhole = (double)MAX_THREADS / num_threads;
            double last = 0;
            double sum = 0;
            int k = 0;

            loads[k].r = 0;
            loads[k].c = 0;
            loads[k].n = 0;


            while (k < num_threads)
            {

                sum = sum + nwhole;

                loads[k].n = (int)sum - (int)last;

                // check last length
                if(k == num_threads-1 && sum != MAX_THREADS)
                {
                    sum=MAX_THREADS;
                    loads[k].n=(int)sum - (int)last;
                }

                // display result
              //  cout << (int)last << " to " << (int)sum << " length: " << (int)sum - int(last) << endl;


                k++;

                if(k < num_threads)
                {
                loads[k].r = ((int)sum) / MAX;
                loads[k].c = ((int)sum) % MAX;
                }


                last = sum;


            }




        //cout << "making threads" << endl;

        void* exit_status;

        int rc;

        for( i = 0; i < num_threads ; i++ ) {
         //     cout << "main() : creating thread, " << i << endl;
              indexes[i] = i;
              rc = pthread_create(&threads[i], NULL, multi, (void *)&indexes[i]);



              if (rc) {
           //      cout << "Error:unable to create thread," << rc << endl;
                 exit(-1);
              }
           }

        // wait for threads to end
        for (j = 0; j < num_threads; j++)
        {

            pthread_join(threads[j], &exit_status);
        }

auto end = chrono::steady_clock::now();
//cout << double( clock() - startTime ) / (double)CLOCKS_PER_SEC/1000000000<< " milli-seconds." << endl;
//cout << chrono::duration_cast<chrono::microseconds>(end - start).count() <<endl;
myfile1 << chrono::duration_cast<chrono::microseconds>(end - start).count() <<",";
sum1 = sum1+chrono::duration_cast<chrono::microseconds>(end - start).count();
 }
 avg = sum1 / 100;
myfile1 << "Average" << "," << avg << "\n";
sum1 =0;
avg = 0;



       }


return 0;
}

Solution 1:

I just recently wrote an answer to a similar question SO: Eigen library with C++11 multithreading.

As I'm interested in this topic too and already had working code at hand, I adapted that sample to OP's task of matrix multiplication:

test-multi-threading-matrix.cc:

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <algorithm>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <limits>
#include <thread>
#include <vector>

template <typename VALUE>
class MatrixT {
  public:
    typedef VALUE Value;
  private:
    size_t _nRows, _nCols;
    std::vector<Value> _values;
  public:
    MatrixT(size_t nRows, size_t nCols, Value value = (Value)0):
      _nRows(nRows), _nCols(nCols), _values(_nRows * _nCols, value)
    { }
    ~MatrixT() = default;

    size_t getNumCols() const { return _nCols; }
    size_t getNumRows() const { return _nRows; }
    Value* operator[](size_t i) { return &_values[0] + i * _nCols; }
    const Value* operator[](size_t i) const { return &_values[0] + i * _nCols; }
};

template <typename VALUE>
VALUE dot(const MatrixT<VALUE> &mat1, size_t iRow, const MatrixT<VALUE> &mat2, size_t iCol)
{
  const size_t n = mat1.getNumCols();
  assert(n == mat2.getNumRows());
  VALUE sum = (VALUE)0;
  for (size_t i = 0; i < n; ++i) sum += mat1[iRow][i] * mat2[i][iCol];
  return sum;
}

typedef MatrixT<double> Matrix;

typedef std::uint16_t Value;
typedef std::chrono::high_resolution_clock Clock;
typedef std::chrono::microseconds MuSecs;
typedef decltype(std::chrono::duration_cast<MuSecs>(Clock::now() - Clock::now())) Time;

Time duration(const Clock::time_point &t0)
{
  return std::chrono::duration_cast<MuSecs>(Clock::now() - t0);
}

Matrix populate(size_t dim)
{
  Matrix mat(dim, dim);
  for (size_t i = 0; i < dim; ++i) {
    for (size_t j = 0; j < dim; ++j) {
      mat[i][j] = ((Matrix::Value)rand() / RAND_MAX) * 100 - 50;
    }
  }
  return mat;
}

std::vector<Time> makeTest(size_t dim)
{
  const size_t NThreads = std::thread::hardware_concurrency();
  const size_t nThreads = std::min(dim * dim, NThreads);
  // make a test sample
  const Matrix sampleA = populate(dim);
  const Matrix sampleB = populate(dim);
  // prepare result vectors
  Matrix results4[4] = {
    Matrix(dim, dim),
    Matrix(dim, dim),
    Matrix(dim, dim),
    Matrix(dim, dim)
  };
  // make test
  std::vector<Time> times{
    [&]() { // single threading
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results4[0];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      // do experiment single-threaded
      for (size_t k = 0, n = dim * dim; k < n; ++k) {
        const size_t i = k / dim, j = k % dim;
        results[i][j] = dot(a, i, b, j);
      }
      // done
      return duration(t0);
    }(),
    [&]() { // multi-threading - stupid aproach
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results4[1];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      // do experiment multi-threaded
      std::vector<std::thread> threads(nThreads);
      for (size_t k = 0, n = dim * dim; k < n;) {
        size_t nT = 0;
        for (; nT < nThreads && k < n; ++nT, ++k) {
          const size_t i = k / dim, j = k % dim;
          threads[nT] = std::move(std::thread(
            [i, j, &results, &a, &b]() {
              results[i][j] = dot(a, i, b, j);
            }));
        }
        for (size_t iT = 0; iT < nT; ++iT) threads[iT].join();
      }
      // done
      return duration(t0);
    }(),
    [&]() { // multi-threading - interleaved
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results4[2];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      // do experiment multi-threaded
      std::vector<std::thread> threads(nThreads);
      for (Value iT = 0; iT < nThreads; ++iT) {
        threads[iT] = std::move(std::thread(
          [iT, dim, &results, &a, &b, nThreads]() {
            for (size_t k = iT, n = dim * dim; k < n; k += nThreads) {
              const size_t i = k / dim, j = k % dim;
              results[i][j] = dot(a, i, b, j);
            }
          }));
      }
      for (std::thread &threadI : threads) threadI.join();
      // done
      return duration(t0);
    }(),
    [&]() { // multi-threading - grouped
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results4[3];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      // do experiment multi-threaded
      std::vector<std::thread> threads(nThreads);
      for (size_t iT = 0; iT < nThreads; ++iT) {
        threads[iT] = std::move(std::thread(
          [iT, dim, &results, &a, &b, nThreads]() {
            const size_t n = dim * dim;
            for (size_t k = iT * n / nThreads, kN = (iT + 1) * n / nThreads;
              k < kN; ++k) {
              const size_t i = k / dim, j = k % dim;
              results[i][j] = dot(a, i, b, j);
            }
          }));
      }
      for (std::thread &threadI : threads) threadI.join();
      // done
      return duration(t0);
    }()
  };
  // check results (must be equal for any kind of computation)
  const unsigned nResults = sizeof results4 / sizeof *results4;
  for (unsigned iResult = 1; iResult < nResults; ++iResult) {
    size_t nErrors = 0;
    for (size_t i = 0; i < dim; ++i) {
      for (size_t j = 0; j < dim; ++j) {
        if (results4[0][i][j] != results4[iResult][i][j]) {
          ++nErrors;
#if 0 // def _DEBUG
          std::cerr
            << "results4[0][" << i << "]["  << j << "]: "
            << results4[0][i][j]
            << " != results4[" << iResult << "][" << i << "][" << j << "]: "
            << results4[iResult][i][j]
            << "!\n";
#endif // _DEBUG
        }
      }
    }
    if (nErrors) std::cerr << nErrors << " errors in results4[" << iResult << "]!\n";
  }
  // done
  return times;
}

int main()
{
  std::cout << "std::thread::hardware_concurrency(): "
    << std::thread::hardware_concurrency() << '\n';
  // heat up
  std::cout << "Heat up...\n";
  for (unsigned i = 0; i < 10; ++i) makeTest(64);
  // perform tests:
  const unsigned NTrials = 10;
  for (size_t dim = 64; dim <= 512; dim *= 2) {
    std::cout << "Test for A[" << dim << "][" << dim << "] * B[" << dim << "][" << dim << "]...\n";
    // repeat NTrials times
    std::cout << "Measuring " << NTrials << " runs...\n"
      << "   "
      << " | " << std::setw(10) << "Single"
      << " | " << std::setw(10) << "Multi 1"
      << " | " << std::setw(10) << "Multi 2"
      << " | " << std::setw(10) << "Multi 3"
      << '\n';
    std::vector<double> sumTimes;
    for (unsigned i = 0; i < NTrials; ++i) {
      std::vector<Time> times = makeTest(dim);
      std::cout << std::setw(2) << (i + 1) << ".";
      for (const Time &time : times) {
        std::cout << " | " << std::setw(10) << time.count();
      }
      std::cout << '\n';
      sumTimes.resize(times.size(), 0.0);
      for (size_t j = 0; j < times.size(); ++j) sumTimes[j] += times[j].count();
    }
    std::cout << "Average Values:\n   ";
    for (const double &sumTime : sumTimes) {
      std::cout << " | "
        << std::setw(10) << std::fixed << std::setprecision(1)
        << sumTime / NTrials;
    }
    std::cout << '\n';
    std::cout << "Ratio:\n   ";
    for (const double &sumTime : sumTimes) {
      std::cout << " | "
        << std::setw(10) << std::fixed << std::setprecision(3)
        << sumTime / sumTimes.front();
    }
    std::cout << "\n\n";
  }
  // done
  return 0;
}

In my first tests, I started with 2×2 matrices, and doubled the number of rows and columns for each test series ending with 64×64 matrices.

I soon came to the same conclusion as Mike: these matrices are much too small. The overhead for setting up and joining threads consumes any speed-up which might've been gained by concurrency. So, I modified the test series starting with 64×64 matrices and ending with 512×512.

I compiled and run on cygwin64 (on Windows 10):

$ g++ --version
g++ (GCC) 7.3.0

$ g++ -std=c++17 -O2 test-multi-threading-matrix.cc -o test-multi-threading-matrix

$ ./test-multi-threading-matrix
std::thread::hardware_concurrency(): 8
Heat up...
Test for A[64][64] * B[64][64]...
Measuring 10 runs...
    |     Single |    Multi 1 |    Multi 2 |    Multi 3
 1. |        417 |     482837 |       1068 |       1080
 2. |        403 |     486775 |       1034 |       1225
 3. |        289 |     482578 |       1478 |       1151
 4. |        282 |     502703 |       1103 |       1081
 5. |        398 |     495351 |       1287 |       1124
 6. |        404 |     501426 |       1050 |       1017
 7. |        402 |     483517 |       1000 |        980
 8. |        271 |     498591 |       1092 |       1047
 9. |        284 |     494732 |        984 |       1057
10. |        288 |     494738 |       1050 |       1116
Average Values:
    |      343.8 |   492324.8 |     1114.6 |     1087.8
Ratio:
    |      1.000 |   1432.009 |      3.242 |      3.164

Test for A[128][128] * B[128][128]...
Measuring 10 runs...
    |     Single |    Multi 1 |    Multi 2 |    Multi 3
 1. |       2282 |    1995527 |       2215 |       1574
 2. |       3076 |    1954316 |       1644 |       1679
 3. |       2952 |    1981908 |       2572 |       2250
 4. |       2119 |    1986365 |       1568 |       1462
 5. |       2676 |    2212344 |       1615 |       1657
 6. |       2396 |    1981545 |       1776 |       1593
 7. |       2513 |    1983718 |       1950 |       1580
 8. |       2614 |    1852414 |       1737 |       1670
 9. |       2148 |    1955587 |       1805 |       1609
10. |       2161 |    1980772 |       1794 |       1826
Average Values:
    |     2493.7 |  1988449.6 |     1867.6 |     1690.0
Ratio:
    |      1.000 |    797.389 |      0.749 |      0.678

Test for A[256][256] * B[256][256]...
Measuring 10 runs...
    |     Single |    Multi 1 |    Multi 2 |    Multi 3
 1. |      32418 |    7992363 |      11753 |      11712
 2. |      32723 |    7961725 |      12342 |      12490
 3. |      32150 |    8041516 |      14646 |      12304
 4. |      30207 |    7810907 |      11512 |      11992
 5. |      30108 |    8005317 |      12853 |      12850
 6. |      32665 |    8064963 |      13197 |      13386
 7. |      36286 |    8825215 |      14381 |      15636
 8. |      35068 |    8015930 |      16954 |      12287
 9. |      30673 |    7973273 |      12061 |      13677
10. |      36323 |    7861856 |      14223 |      13510
Average Values:
    |    32862.1 |  8055306.5 |    13392.2 |    12984.4
Ratio:
    |      1.000 |    245.125 |      0.408 |      0.395

Test for A[512][512] * B[512][512]...
Measuring 10 runs...
    |     Single |    Multi 1 |    Multi 2 |    Multi 3
 1. |     404459 |   32803878 |     107078 |     103493
 2. |     289870 |   32482887 |      98244 |     103338
 3. |     333695 |   29398109 |      87735 |      77531
 4. |     236028 |   27286537 |      81620 |      76085
 5. |     254294 |   27418963 |      89191 |      76760
 6. |     230662 |   27278077 |      78454 |      84063
 7. |     274278 |   27180899 |      74828 |      83829
 8. |     292294 |   29942221 |     106133 |     103450
 9. |     292091 |   33011277 |     100545 |      96935
10. |     401007 |   33502134 |      98230 |      95592
Average Values:
    |   300867.8 | 30030498.2 |    92205.8 |    90107.6
Ratio:
    |      1.000 |     99.813 |      0.306 |      0.299

I did the same with VS2013 (release mode) and got similar results.

A speed-up of 3 sounds not that bad (ignoring the fact that it's still far away from 8 which you might expect as ideal for a H/W concurrency of 8).


While fiddling with the matrix multiplication, I got an idea for optimization which I wanted to check as well – even beyond multi-threading. It is the attempt to improve cache-locality.

For this, I transpose the 2nd matrix before multiplication. For the multiplication, a modified version of dot() (dotT()) is used which considers the transposition of 2nd matrix respectively.

I modified the above sample code respectively and got test-single-threading-matrix-transpose.cc:

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <algorithm>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <limits>
#include <vector>

template <typename VALUE>
class MatrixT {
  public:
    typedef VALUE Value;
  private:
    size_t _nRows, _nCols;
    std::vector<Value> _values;
  public:
    MatrixT(size_t nRows, size_t nCols, Value value = (Value)0):
      _nRows(nRows), _nCols(nCols), _values(_nRows * _nCols, value)
    { }
    ~MatrixT() = default;

    size_t getNumCols() const { return _nCols; }
    size_t getNumRows() const { return _nRows; }
    Value* operator[](size_t i) { return &_values[0] + i * _nCols; }
    const Value* operator[](size_t i) const { return &_values[0] + i * _nCols; }
};

template <typename VALUE>
VALUE dot(const MatrixT<VALUE> &mat1, size_t iRow, const MatrixT<VALUE> &mat2, size_t iCol)
{
  const size_t n = mat1.getNumCols();
  assert(n == mat2.getNumRows());
  VALUE sum = (VALUE)0;
  for (size_t i = 0; i < n; ++i) sum += mat1[iRow][i] * mat2[i][iCol];
  return sum;
}

template <typename VALUE>
MatrixT<VALUE> transpose(const MatrixT<VALUE> mat)
{
  MatrixT<VALUE> matT(mat.getNumCols(), mat.getNumRows());
  for (size_t i = 0; i < mat.getNumRows(); ++i) {
    for (size_t j = 0; j < mat.getNumCols(); ++j) {
      matT[j][i] = mat[i][j];
    }
  }
  return matT;
}

template <typename VALUE>
VALUE dotT(const MatrixT<VALUE> &mat1, size_t iRow1, const MatrixT<VALUE> &matT2, size_t iRow2)
{
  const size_t n = mat1.getNumCols();
  assert(n == matT2.getNumCols());
  VALUE sum = (VALUE)0;
  for (size_t i = 0; i < n; ++i) sum += mat1[iRow1][i] * matT2[iRow2][i];
  return sum;
}

typedef MatrixT<double> Matrix;

typedef std::uint16_t Value;
typedef std::chrono::high_resolution_clock Clock;
typedef std::chrono::microseconds MuSecs;
typedef decltype(std::chrono::duration_cast<MuSecs>(Clock::now() - Clock::now())) Time;

Time duration(const Clock::time_point &t0)
{
  return std::chrono::duration_cast<MuSecs>(Clock::now() - t0);
}

Matrix populate(size_t dim)
{
  Matrix mat(dim, dim);
  for (size_t i = 0; i < dim; ++i) {
    for (size_t j = 0; j < dim; ++j) {
      mat[i][j] = ((Matrix::Value)rand() / RAND_MAX) * 100 - 50;
    }
  }
  return mat;
}

std::vector<Time> makeTest(size_t dim)
{
  // make a test sample
  const Matrix sampleA = populate(dim);
  const Matrix sampleB = populate(dim);
  // prepare result vectors
  Matrix results2[2] = {
    Matrix(dim, dim),
    Matrix(dim, dim)
  };
  // make test
  std::vector<Time> times{
    [&]() { // single threading
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results2[0];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      // do experiment single-threaded
      for (size_t k = 0, n = dim * dim; k < n; ++k) {
        const size_t i = k / dim, j = k % dim;
        results[i][j] = dot(a, i, b, j);
      }
      // done
      return duration(t0);
    }(),
    [&]() { // single threading - with transposed matrix
      // make a copy of test sample
      const Matrix a(sampleA), b(sampleB);
      Matrix &results = results2[1];
      // remember start time
      const Clock::time_point t0 = Clock::now();
      const Matrix bT = transpose(b);
      // do experiment single-threaded with transposed B
      for (size_t k = 0, n = dim * dim; k < n; ++k) {
        const size_t i = k / dim, j = k % dim;
        results[i][j] = dotT(a, i, bT, j);
      }
      // done
      return duration(t0);
    }()
  };
  // check results (must be equal for any kind of computation)
  const unsigned nResults = sizeof results2 / sizeof *results2;
  for (unsigned iResult = 1; iResult < nResults; ++iResult) {
    size_t nErrors = 0;
    for (size_t i = 0; i < dim; ++i) {
      for (size_t j = 0; j < dim; ++j) {
        if (results2[0][i][j] != results2[iResult][i][j]) {
          ++nErrors;
#if 0 // def _DEBUG
          std::cerr
            << "results2[0][" << i << "]["  << j << "]: "
            << results2[0][i][j]
            << " != results2[" << iResult << "][" << i << "][" << j << "]: "
            << results2[iResult][i][j]
            << "!\n";
#endif // _DEBUG
        }
      }
    }
    if (nErrors) std::cerr << nErrors << " errors in results2[" << iResult << "]!\n";
  }
  // done
  return times;
}

int main()
{
  // heat up
  std::cout << "Heat up...\n";
  for (unsigned i = 0; i < 10; ++i) makeTest(64);
  // perform tests:
  const unsigned NTrials = 10;
  for (size_t dim = 64; dim <= 512; dim *= 2) {
    std::cout << "Test for A[" << dim << "][" << dim << "] * B[" << dim << "][" << dim << "]...\n";
    // repeat NTrials times
    std::cout << "Measuring " << NTrials << " runs...\n"
      << "   "
      << " | " << std::setw(10) << "A * B"
      << " | " << std::setw(10) << "A *T B^T"
      << '\n';
    std::vector<double> sumTimes;
    for (unsigned i = 0; i < NTrials; ++i) {
      std::vector<Time> times = makeTest(dim);
      std::cout << std::setw(2) << (i + 1) << ".";
      for (const Time &time : times) {
        std::cout << " | " << std::setw(10) << time.count();
      }
      std::cout << '\n';
      sumTimes.resize(times.size(), 0.0);
      for (size_t j = 0; j < times.size(); ++j) sumTimes[j] += times[j].count();
    }
    std::cout << "Average Values:\n   ";
    for (const double &sumTime : sumTimes) {
      std::cout << " | "
        << std::setw(10) << std::fixed << std::setprecision(1)
        << sumTime / NTrials;
    }
    std::cout << '\n';
    std::cout << "Ratio:\n   ";
    for (const double &sumTime : sumTimes) {
      std::cout << " | "
        << std::setw(10) << std::fixed << std::setprecision(3)
        << sumTime / sumTimes.front();
    }
    std::cout << "\n\n";
  }
  // done
  return 0;
}

I compiled and run again on cygwin64 (on Windows 10):

$ g++ -std=c++17 -O2 test-single-threading-matrix-transpose.cc -o test-single-threading-matrix-transpose && ./test-single-threading-matrix-transpose
Heat up...
Test for A[64][64] * B[64][64]...
Measuring 10 runs...
    |      A * B |   A *T B^T
 1. |        394 |        366
 2. |        394 |        368
 3. |        396 |        367
 4. |        382 |        368
 5. |        392 |        289
 6. |        297 |        343
 7. |        360 |        341
 8. |        399 |        358
 9. |        385 |        354
10. |        406 |        374
Average Values:
    |      380.5 |      352.8
Ratio:
    |      1.000 |      0.927

Test for A[128][128] * B[128][128]...
Measuring 10 runs...
    |      A * B |   A *T B^T
 1. |       2972 |       2558
 2. |       3317 |       2556
 3. |       3279 |       2689
 4. |       2952 |       2213
 5. |       2745 |       2668
 6. |       2981 |       2457
 7. |       2164 |       2274
 8. |       2634 |       2106
 9. |       2126 |       2389
10. |       3015 |       2477
Average Values:
    |     2818.5 |     2438.7
Ratio:
    |      1.000 |      0.865

Test for A[256][256] * B[256][256]...
Measuring 10 runs...
    |      A * B |   A *T B^T
 1. |      31312 |      17656
 2. |      29249 |      17127
 3. |      32127 |      16865
 4. |      29655 |      17287
 5. |      32137 |      17687
 6. |      29788 |      16732
 7. |      32251 |      16549
 8. |      32272 |      16257
 9. |      28019 |      18042
10. |      30334 |      17936
Average Values:
    |    30714.4 |    17213.8
Ratio:
    |      1.000 |      0.560

Test for A[512][512] * B[512][512]...
Measuring 10 runs...
    |      A * B |   A *T B^T
 1. |     322005 |     135102
 2. |     310180 |     134897
 3. |     329994 |     134304
 4. |     335375 |     137701
 5. |     330754 |     134252
 6. |     353761 |     136732
 7. |     359234 |     135632
 8. |     351498 |     134389
 9. |     360754 |     135751
10. |     368602 |     137139
Average Values:
    |   342215.7 |   135589.9
Ratio:
    |      1.000 |      0.396

Impressive, isn't it?

It achieves similar speed-ups like the above multi-threading attempts (I mean the better ones) but with a single core.

The additional effort for transposing the 2nd matrix (which is considered in measurement) does more than amortize. This isn't that surprising because there a so many more read-accesses in multiplication (which now access consecutive bytes) compared to the additional effort to construct/write the transposed matrix once.

Solution 2:

First off, your matrix sizes are way too little to multiply them in a multi-threaded manner, because the creation of the threads, context switches and joining the threads will most likely introduce overhead that takes longer than multiplying the values. For larger matrix sizes, you'll have to measure (I guess it will be around 50x50), the overhead of the threads will be low enough compared to the time of the multiplications and thus, the performance will improve.
Furthermore, you are creating way too many threads. You are creating one thread for one row of the matrix, so the overhead will be immense. If you have 4 cores on your CPU, creating more than 4 threads(including main thread) will result in an increasing overhead through context switches. What you can do here, is create few threads and distribute the data between the threads, so for example (note that I'm using std::thread for simplicity):

int a[50][50];
int b[50][50];
int c[50][50];

void multiply_part_of_matrix(int start, int end) {
  for(int i=start; i < end; ++i) {
    for (int j = 0; j < 50; ++j) {
      c[i][j] = 0;
      for(int k = 0; k < 50; ++i) {
        c[i][j] = a[i][k] * b[k][j];
      }
    }
  }
}


int main() {
  // initializes matrix
  std::vector<std::thread> threads;
  // start time
  for(int i=0; i < 5; ++i) {
    threads.emplace_back(multiply_part_of_matrix, i*10, i*10+10); 
  }

  for(int i = 0; i < 5; ++i) {
    threads.at(i).join();
  }
  // stop time
  return 0;
}

Note that it would also improve the performance if you provide some data to the main thread so that it doesn't block (overhead) while waiting on the other threads.

If you want to further improve performance, you may consider different algorithms (Strassen's algorithm) or cache optimizations for example by loop unrolling.