CPU Performance in Java.
Перед началом оптимизации
● а это нужно вообще?
● метрики
● качественные бенчмарки
● корректность измерений
● узкое место
Откуда ноги растут?
● плохая архитектура
● проблемы с потоками
● неэффективные алгоритмы
● неразумные структуры данных
● неаккуратная работа с памятью
● потери с I/O и работой с сетью
● ...
Задача: Ускорить код
int N = 8192;
byte[][] arr = new byte[N][N];
/**
 * Scans the leading N x N part of {@code arr}, decrementing a counter for every negative
 * element, and reports whether the counter ended below zero.
 *
 * <p>Baseline traversal of the exercise: the outer loop fixes the second index and the inner
 * loop advances the first one, so consecutive reads hit {@code arr[0][c]}, {@code arr[1][c]}, ...
 *
 * @param arr matrix to scan; needs at least N inner arrays of at least N bytes each
 * @param N   number of positions visited along each dimension
 * @return {@code true} if the counter is negative after the scan
 */
static boolean check(byte[][] arr, int N) {
    int tally = 0;
    for (int second = 0; second < N; second++) {
        for (int first = 0; first < N; first++) {
            // Index order is deliberate: the inner loop walks the FIRST dimension.
            if (arr[first][second] < 0) {
                tally -= 1;
            }
        }
    }
    return tally < 0;
}
3_744 ms
Нужно разобраться с устройством CPU
Если проще
CPU Cache
Вариант 1! Уже хорошо.
int N = 8192;
byte[][] arr = new byte[N][N];
/**
 * Scans the leading N x N part of {@code arr}, decrementing a counter for every negative
 * element, and reports whether the counter ended below zero.
 *
 * <p>Variant 1 of the exercise: the outer loop fixes the first index and the inner loop
 * advances the second one, so each inner array is read from start to end before moving on.
 *
 * @param arr matrix to scan; needs at least N inner arrays of at least N bytes each
 * @param N   number of positions visited along each dimension
 * @return {@code true} if the counter is negative after the scan
 */
static boolean check(byte[][] arr, int N) {
    int tally = 0;
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            if (arr[row][col] < 0) {
                tally -= 1;
            }
        }
    }
    return tally < 0;
}
264 ms
Вариант 2! Отлично.
int N = 8192;
byte[][] arr = new byte[N][N];
/**
 * Scans the leading N x N part of {@code arr} and reports whether the accumulated sign
 * contributions of its elements are negative.
 *
 * <p>Variant 2 of the exercise: same traversal order as variant 1, but the comparison is
 * replaced by an arithmetic shift, so the loop body contains no conditional statement.
 *
 * @param arr matrix to scan; needs at least N inner arrays of at least N bytes each
 * @param N   number of positions visited along each dimension
 * @return {@code true} if the counter is negative after the scan
 */
static boolean check(byte[][] arr, int N) {
    int tally = 0;
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            // The byte is widened to int before shifting: negative values yield -1, others 0.
            int signContribution = arr[row][col] >> 7;
            tally += signContribution;
        }
    }
    return tally < 0;
}
214 ms
Итого:
3744 / 214 ≈ 17.5
(результат зависит от CPU)
Нужно все распоточить !
Работает медленно.
/**
 * Benchmark worker that increments one element of {@code arr} a fixed number of times.
 * {@code arr} is not declared in this class; it has to be visible from the surrounding code.
 */
public class IterationThread implements Runnable {
// Position in arr that this worker increments.
private int index;
// How many increments run() performs.
private long iterations;
/**
 * @param iterations number of increments run() performs
 * @param index position in arr this worker writes to
 */
public IterationThread(long iterations, int index) {
this.index = index;
this.iterations = iterations;
}
/** Increments arr[index] once per loop pass, {@code iterations} times in total. */
@Override
public void run() {
for(long l = 0; l < iterations; ++l) {
// Plain read-modify-write of a single element; nothing here makes the increment atomic.
++arr[index];
}
}
}
/**
 * Timing harness: runs one IterationThread per available processor and prints the elapsed
 * milliseconds. Thread i is given index i, so the workers write to neighbouring elements of arr.
 */
public class FalseSharing {
// volatile qualifies the array reference only, not the individual elements.
// NOTE(review): 512 is a fixed capacity; worker indices 0..THREAD_COUNT-1 must stay below it.
private static volatile long arr[] = new long[512];
// One worker thread per processor reported by the runtime.
private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
// Number of increments each worker performs.
private static final long ITERATIONS = 2_000_000_000L;
/**
 * Builds all threads first, then measures only the window from the first start() to the last
 * join() and prints "time " followed by the elapsed milliseconds.
 */
public static void main(String[] args) throws Throwable {
Thread[] threads = new Thread[THREAD_COUNT];
for(int i = 0; i < THREAD_COUNT; ++i) {
// Worker i increments arr[i].
threads[i] = new Thread(new IterationThread(ITERATIONS, i));
}
// The clock starts after thread construction, so construction cost is not part of the result.
long start = System.currentTimeMillis();
for(Thread t: threads) {
t.start();
}
// Wait for every worker before reading the clock again.
for(Thread t: threads) {
t.join();
}
System.out.println("time " + (System.currentTimeMillis() - start));
}
}
25_406 ms
Не все так просто.
False sharing.
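False sharing — ситуация, когда потоки обращаются к разным объектам программы, которые лежат в одном и том же блоке
кэш-памяти: запись одного потока делает этот блок недействительным в кэшах остальных ядер.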
Работает хорошо.
/**
 * Benchmark worker that increments one element of {@code arr} a fixed number of times.
 * {@code arr} is not declared in this class; it has to be visible from the surrounding code.
 */
public class IterationThread implements Runnable {
// Position in arr that this worker increments.
private int index;
// How many increments run() performs.
private long iterations;
/**
 * @param iterations number of increments run() performs
 * @param index position in arr this worker writes to
 */
public IterationThread(long iterations, int index) {
this.index = index;
this.iterations = iterations;
}
/** Increments arr[index] once per loop pass, {@code iterations} times in total. */
@Override
public void run() {
for(long l = 0; l < iterations; ++l) {
// Plain read-modify-write of a single element; nothing here makes the increment atomic.
++arr[index];
}
}
}
/**
 * Timing harness: runs one IterationThread per available processor, with the workers' array
 * slots spaced {@code STRIDE} elements apart, and prints the elapsed milliseconds.
 *
 * <p>Worker i writes to {@code arr[(i + 1) * STRIDE]}. The array is sized from the processor
 * count so that the highest slot always exists; with the previous fixed length of 512 a machine
 * reporting 64 or more processors failed with ArrayIndexOutOfBoundsException.
 */
public class TrueSharing {
    // Declared before arr: static initialisers run in textual order and arr's size depends on it.
    private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
    // Number of increments each worker performs.
    private static final long ITERATIONS = 2_000_000_000L;
    // Distance, in array elements, between the slots of two consecutive workers.
    private static final int STRIDE = 8;
    // Largest index used is THREAD_COUNT * STRIDE; never smaller than the original 512 elements.
    // volatile qualifies the array reference only, not the individual elements.
    private static volatile long arr[] = new long[Math.max(512, (THREAD_COUNT + 1) * STRIDE)];

    /**
     * Builds all threads first, then measures only the window from the first start() to the
     * last join() and prints "time " followed by the elapsed milliseconds.
     */
    public static void main(String[] args) throws Throwable {
        Thread[] threads = new Thread[THREAD_COUNT];
        for (int i = 0; i < THREAD_COUNT; ++i) {
            // Slot 0 is left unused; worker i gets slot (i + 1) * STRIDE.
            threads[i] = new Thread(new IterationThread(ITERATIONS, (i + 1) * STRIDE));
        }
        // The clock starts after thread construction, so construction cost is not measured.
        long start = System.currentTimeMillis();
        for (Thread t : threads) {
            t.start();
        }
        // Wait for every worker before reading the clock again.
        for (Thread t : threads) {
            t.join();
        }
        System.out.println("time " + (System.currentTimeMillis() - start));
    }
}
4_949 ms
А что, если нужно использовать свой
собственный класс?
False sharing with custom object
/**
 * Benchmark worker that increments the {@code val} field of one element of {@code arr} a fixed
 * number of times. {@code arr} is not declared in this class; it comes from the surrounding code.
 */
public static class IterationThread implements
Runnable {
// Position in arr whose object this worker updates.
private int index;
// How many increments run() performs.
private long iterations;
/**
 * @param iterations number of increments run() performs
 * @param index position in arr of the object this worker updates
 */
public IterationThread(long iterations, int index) {
this.index = index;
this.iterations = iterations;
}
/** Increments arr[index].val once per loop pass, {@code iterations} times in total. */
@Override
public void run() {
for(long l = 0; l < iterations; ++l) {
// Read-modify-write on the field; nothing here makes the increment atomic.
++arr[index].val;
}
}
}
/**
 * Timing harness: runs one IterationThread per available processor, each updating its own
 * MyObject instance, and prints the elapsed milliseconds.
 */
public class FalseSharing
{
// One worker thread per processor reported by the runtime.
private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
// Number of increments each worker performs.
private static final long ITERATIONS = 2_000_000_000L;
// One MyObject per worker; the elements are filled in by the static initialiser below.
private static MyObject arr[] = new MyObject[THREAD_COUNT];
static {
// All objects are allocated back to back in one loop.
// NOTE(review): they presumably end up adjacent in memory, but placement is up to the JVM.
for (int i = 0; i <arr.length; i++) {
arr[i] = new MyObject();
}
}
/**
 * Builds all threads first, then measures only the window from the first start() to the last
 * join() and prints "time " followed by the elapsed milliseconds.
 */
public static void main(String[] args) throws Throwable {
Thread[] threads = new Thread[THREAD_COUNT];
for(int i = 0; i < THREAD_COUNT; ++i) {
// Worker i updates arr[i].
threads[i] = new Thread(new IterationThread(ITERATIONS, i));
}
// The clock starts after thread construction, so construction cost is not part of the result.
long start = System.currentTimeMillis();
for(Thread t: threads) {
t.start();
}
// Wait for every worker before reading the clock again.
for(Thread t: threads) {
t.join();
}
System.out.println("time " + (System.currentTimeMillis() - start));
}
}
149_743 ms
/** Minimal counter holder: a single public volatile long that the workers increment directly. */
public static class MyObject{
// Incremented by IterationThread via ++arr[index].val.
public volatile long val = 0L;
}
Java 7 Padding.
/**
 * Benchmark worker that calls incrementAndGet() on one element of {@code arr} a fixed number
 * of times. {@code arr} is not declared in this class; it comes from the surrounding code.
 */
public static class IterationThread implements
Runnable {
// Position in arr whose object this worker updates.
private int index;
// How many increments run() performs.
private long iterations;
/**
 * @param iterations number of increments run() performs
 * @param index position in arr of the object this worker updates
 */
public IterationThread(long iterations, int index) {
this.index = index;
this.iterations = iterations;
}
/** Calls arr[index].incrementAndGet() once per loop pass, {@code iterations} times in total. */
@Override
public void run() {
for(long l = 0; l < iterations; ++l) {
// The returned value is discarded; only the side effect on the counter matters.
arr[index].incrementAndGet();
}
}
}
/**
 * Timing harness: runs one IterationThread per available processor, each updating its own
 * MyObject instance, and prints the elapsed milliseconds.
 */
public class FalseSharing
{
// One worker thread per processor reported by the runtime.
private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
// Number of increments each worker performs.
private static final long ITERATIONS = 2_000_000_000L;
// One MyObject per worker; the elements are filled in by the static initialiser below.
private static MyObject arr[] = new MyObject[THREAD_COUNT];
static {
for (int i = 0; i <arr.length; i++) {
arr[i] = new MyObject();
}
}
/**
 * Builds all threads first, then measures only the window from the first start() to the last
 * join() and prints "time " followed by the elapsed milliseconds.
 */
public static void main(String[] args) throws Throwable {
Thread[] threads = new Thread[THREAD_COUNT];
for(int i = 0; i < THREAD_COUNT; ++i) {
// Worker i updates arr[i].
threads[i] = new Thread(new IterationThread(ITERATIONS, i));
}
// The clock starts after thread construction, so construction cost is not part of the result.
long start = System.currentTimeMillis();
for(Thread t: threads) {
t.start();
}
// Wait for every worker before reading the clock again.
for(Thread t: threads) {
t.join();
}
System.out.println("time " + (System.currentTimeMillis() - start));
}
}
14_539 ms
/**
 * AtomicLong counter with six additional long fields. The visible code never reads or writes
 * p1..p6; judging by the slide title ("Java 7 Padding") they presumably exist only to enlarge
 * the object. NOTE(review): actual field layout is JVM-specific; confirm with a layout tool.
 */
public static class MyObject extends AtomicLong {
// Only p6 receives the initializer 7L; p1..p5 keep the default value 0.
public volatile long p1, p2, p3, p4, p5, p6 = 7L;
}
Java 8. @sun.misc.Contended
/**
 * Benchmark worker that calls incrementAndGet() on one element of {@code arr} a fixed number
 * of times. {@code arr} is not declared in this class; it comes from the surrounding code.
 */
public static class IterationThread implements
Runnable {
// Position in arr whose object this worker updates.
private int index;
// How many increments run() performs.
private long iterations;
/**
 * @param iterations number of increments run() performs
 * @param index position in arr of the object this worker updates
 */
public IterationThread(long iterations, int index) {
this.index = index;
this.iterations = iterations;
}
/** Calls arr[index].incrementAndGet() once per loop pass, {@code iterations} times in total. */
@Override
public void run() {
for(long l = 0; l < iterations; ++l) {
// The returned value is discarded; only the side effect on the counter matters.
arr[index].incrementAndGet();
}
}
}
/**
 * Timing harness: runs one IterationThread per available processor, each updating its own
 * MyObject instance, and prints the elapsed milliseconds.
 */
public class FalseSharing
{
// One worker thread per processor reported by the runtime.
private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors();
// Number of increments each worker performs.
private static final long ITERATIONS = 2_000_000_000L;
// One MyObject per worker; the elements are filled in by the static initialiser below.
private static MyObject arr[] = new MyObject[THREAD_COUNT];
static {
for (int i = 0; i <arr.length; i++) {
arr[i] = new MyObject();
}
}
/**
 * Builds all threads first, then measures only the window from the first start() to the last
 * join() and prints "time " followed by the elapsed milliseconds.
 */
public static void main(String[] args) throws Throwable {
Thread[] threads = new Thread[THREAD_COUNT];
for(int i = 0; i < THREAD_COUNT; ++i) {
// Worker i updates arr[i].
threads[i] = new Thread(new IterationThread(ITERATIONS, i));
}
// The clock starts after thread construction, so construction cost is not part of the result.
long start = System.currentTimeMillis();
for(Thread t: threads) {
t.start();
}
// Wait for every worker before reading the clock again.
for(Thread t: threads) {
t.join();
}
System.out.println("time " + (System.currentTimeMillis() - start));
}
}
14_983 ms
// Required JVM option: -XX:-RestrictContended. Without it HotSpot ignores @Contended on
// application classes (the slide calls this the "unlock" option).
// The import is not shown here; the slide title names the annotation as sun.misc.Contended.
@Contended
public static class MyObject extends AtomicLong {
// Field declared by the annotated subclass; nothing in the visible code reads or writes it.
public volatile long anyVal;
}
Спасибо за внимание

CPU Performance in Java.

  • 1.
  • 2.
    Перед началом оптимизации ● а это нужно вообще? ● метрики ● качественные бенчмарки ● корректность измерений ● узкое место
  • 3.
    Откуда ноги растут? ● плохая архитектура ● проблемы с потоками ● неэффективные алгоритмы ● неразумные структуры данных ● неаккуратная работа с памятью ● потери с I/O и работой с сетью ● ...
  • 4.
    Задача: Ускорить код int N = 8192; byte[][] arr = new byte[N][N]; static boolean check(byte[][] arr, int N) { int count = 0; for(int i=0; i< N; i++) for(int j=0; j< N; j++) if(arr[j][i] < 0) count--; return count < 0; } 3_744 ms
  • 5.
    Нужно разобраться с устройством CPU
  • 6.
  • 7.
  • 8.
    Вариант 1! Уже хорошо. int N = 8192; byte[][] arr = new byte[N][N]; static boolean check(byte[][] arr, int N) { int count = 0; for(int i=0; i< N; i++) for(int j=0; j< N; j++) if(arr[i][j] < 0) count--; return count < 0; } 264 ms
  • 9.
    Вариант 2! Отлично. int N = 8192; byte[][] arr = new byte[N][N]; static boolean check(byte[][] arr, int N) { int count = 0; for(int i=0; i< N; i++) for(int j=0; j< N; j++) count += arr[i][j] >> 7; return count < 0; } 214 ms
  • 10.
  • 11.
  • 12.
    Работает медленно. public class IterationThread implements Runnable { private int index; private long iterations; public IterationThread(long iterations, int index) { this.index = index; this.iterations = iterations; } @Override public void run() { for(long l = 0; l < iterations; ++l) { ++arr[index]; } } } public class FalseSharing { private static volatile long arr[] = new long[512]; private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors(); private static final long ITERATIONS = 2_000_000_000L; public static void main(String[] args) throws Throwable { Thread[] threads = new Thread[THREAD_COUNT]; for(int i = 0; i < THREAD_COUNT; ++i) { threads[i] = new Thread(new IterationThread(ITERATIONS, i)); } long start = System.currentTimeMillis(); for(Thread t: threads) { t.start(); } for(Thread t: threads) { t.join(); } System.out.println("time " + (System.currentTimeMillis() - start)); } } 25_406 ms
  • 13.
    Не все так просто. False sharing. false sharing означает доступ к разным объектам в программе, разделяющим один и тот же блок кэш-памяти.
  • 14.
    Работает хорошо. public class IterationThread implements Runnable { private int index; private long iterations; public IterationThread(long iterations, int index) { this.index = index; this.iterations = iterations; } @Override public void run() { for(long l = 0; l < iterations; ++l) { ++arr[index]; } } } public class TrueSharing { private static volatile long arr[] = new long[512]; private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors(); private static final long ITERATIONS = 2_000_000_000L; public static void main(String[] args) throws Throwable { Thread[] threads = new Thread[THREAD_COUNT]; for(int i = 0; i < THREAD_COUNT; ++i) { threads[i] = new Thread(new IterationThread(ITERATIONS, (i+1)*8)); } long start = System.currentTimeMillis(); for(Thread t: threads) { t.start(); } for(Thread t: threads) { t.join(); } System.out.println("time " + (System.currentTimeMillis() - start)); } } 4_949 ms
  • 15.
    А что если нужно использовать свой собственный класс ???
  • 16.
    False sharing with custom object public static class IterationThread implements Runnable { private int index; private long iterations; public IterationThread(long iterations, int index) { this.index = index; this.iterations = iterations; } @Override public void run() { for(long l = 0; l < iterations; ++l) { ++arr[index].val; } } } public class FalseSharing { private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors(); private static final long ITERATIONS = 2_000_000_000L; private static MyObject arr[] = new MyObject[THREAD_COUNT]; static { for (int i = 0; i <arr.length; i++) { arr[i] = new MyObject(); } } public static void main(String[] args) throws Throwable { Thread[] threads = new Thread[THREAD_COUNT]; for(int i = 0; i < THREAD_COUNT; ++i) { threads[i] = new Thread(new IterationThread(ITERATIONS, i)); } long start = System.currentTimeMillis(); for(Thread t: threads) { t.start(); } for(Thread t: threads) { t.join(); } System.out.println("time " + (System.currentTimeMillis() - start)); } } 149_743 ms public static class MyObject{ public volatile long val = 0L; }
  • 17.
    Java 7 Padding. public static class IterationThread implements Runnable { private int index; private long iterations; public IterationThread(long iterations, int index) { this.index = index; this.iterations = iterations; } @Override public void run() { for(long l = 0; l < iterations; ++l) { arr[index].incrementAndGet(); } } } public class FalseSharing { private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors(); private static final long ITERATIONS = 2_000_000_000L; private static MyObject arr[] = new MyObject[THREAD_COUNT]; static { for (int i = 0; i <arr.length; i++) { arr[i] = new MyObject(); } } public static void main(String[] args) throws Throwable { Thread[] threads = new Thread[THREAD_COUNT]; for(int i = 0; i < THREAD_COUNT; ++i) { threads[i] = new Thread(new IterationThread(ITERATIONS, i)); } long start = System.currentTimeMillis(); for(Thread t: threads) { t.start(); } for(Thread t: threads) { t.join(); } System.out.println("time " + (System.currentTimeMillis() - start)); } } 14_539 ms public static class MyObject extends AtomicLong { public volatile long p1, p2, p3, p4, p5, p6 = 7L; }
  • 18.
    Java 8. @sun.misc.Contended public static class IterationThread implements Runnable { private int index; private long iterations; public IterationThread(long iterations, int index) { this.index = index; this.iterations = iterations; } @Override public void run() { for(long l = 0; l < iterations; ++l) { arr[index].incrementAndGet(); } } } public class FalseSharing { private static final int THREAD_COUNT = Runtime.getRuntime().availableProcessors(); private static final long ITERATIONS = 2_000_000_000L; private static MyObject arr[] = new MyObject[THREAD_COUNT]; static { for (int i = 0; i <arr.length; i++) { arr[i] = new MyObject(); } } public static void main(String[] args) throws Throwable { Thread[] threads = new Thread[THREAD_COUNT]; for(int i = 0; i < THREAD_COUNT; ++i) { threads[i] = new Thread(new IterationThread(ITERATIONS, i)); } long start = System.currentTimeMillis(); for(Thread t: threads) { t.start(); } for(Thread t: threads) { t.join(); } System.out.println("time " + (System.currentTimeMillis() - start)); } } 14_983 ms // unlock JVM option: -XX:-RestrictContended @Contended public static class MyObject extends AtomicLong { public volatile long anyVal; }
  • 19.