Improve DMatrix creation performance in python

arieleizenberg · arieleizenberg · commit 3ee46952cd4b · 2024-06-11T00:47:55.000+03:00
The xgboost Python package serializes numpy arrays as JSON.
This has non-trivial overhead for small datasets.

This patch optimizes the specific case where the numpy array is already in
C-contiguous 32-bit floating point format and has rows*cols <= 32768,
and loads it directly without the JSON layer.
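Benchmark results from xgboost/tests/python/microbench_numpy.py
(Ratio = Optimized / Current, so values below 100% mean the patch is faster):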

Threads  | Rows     | Cols     | Current (sec)   | Optimized (sec) | Ratio
       1 |        1 |     1000 |       0.0001921 |       0.0001703 |        88.6%
       1 |        4 |     1000 |       0.0001689 |       0.0001437 |        85.1%
       1 |       16 |     1000 |       0.0002639 |       0.0002457 |        93.1%
       1 |       64 |     1000 |       0.0006843 |       0.0006719 |        98.2%
       1 |      256 |     1000 |        0.002611 |        0.002655 |       101.7%
       1 |     1024 |     1000 |           0.013 |          0.0126 |        97.0%
       1 |     4096 |     1000 |         0.06081 |          0.0593 |        97.5%
       1 |    16384 |     1000 |          0.2981 |          0.2974 |        99.8%
       2 |        1 |     1000 |       0.0001415 |       0.0001196 |        84.6%
       2 |        4 |     1000 |       0.0002155 |       0.0002003 |        93.0%
       2 |       16 |     1000 |       0.0002137 |        0.000196 |        91.7%
       2 |       64 |     1000 |       0.0005054 |       0.0004855 |        96.1%
       2 |      256 |     1000 |        0.001613 |        0.001687 |       104.6%
       2 |     1024 |     1000 |        0.007743 |        0.008194 |       105.8%
       2 |     4096 |     1000 |         0.03791 |         0.03783 |        99.8%
       2 |    16384 |     1000 |          0.2077 |          0.2037 |        98.1%
       4 |        1 |     1000 |       0.0001374 |       0.0001237 |        90.0%
       4 |        4 |     1000 |       0.0001985 |       0.0001621 |        81.7%
       4 |       16 |     1000 |       0.0002266 |       0.0001988 |        87.7%
       4 |       64 |     1000 |       0.0005175 |       0.0004775 |        92.3%
       4 |      256 |     1000 |         0.00166 |        0.001594 |        96.0%
       4 |     1024 |     1000 |        0.008257 |        0.008097 |        98.1%
       4 |     4096 |     1000 |         0.03492 |          0.0354 |       101.4%
       4 |    16384 |     1000 |          0.1896 |          0.1897 |       100.0%
       8 |        1 |     1000 |       0.0001471 |       0.0001254 |        85.3%
       8 |        4 |     1000 |       0.0003609 |        0.000326 |        90.4%
       8 |       16 |     1000 |       0.0002651 |       0.0002217 |        83.6%
       8 |       64 |     1000 |       0.0003504 |       0.0003064 |        87.5%
       8 |      256 |     1000 |       0.0008264 |       0.0008729 |       105.6%
       8 |     1024 |     1000 |        0.003367 |        0.003127 |        92.9%
       8 |     4096 |     1000 |         0.01932 |         0.01799 |        93.1%
       8 |    16384 |     1000 |          0.1245 |          0.1208 |        97.0%
diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
@@ -252,17 +252,30 @@ def _from_numpy_array(
     _check_data_shape(data)
     data, _ = _ensure_np_dtype(data, data.dtype)
     handle = ctypes.c_void_p()
-    _check_call(
-        _LIB.XGDMatrixCreateFromDense(
-            _array_interface(data),
-            make_jcargs(
-                missing=float(missing),
-                nthread=int(nthread),
-                data_split_mode=int(data_split_mode),
-            ),
-            ctypes.byref(handle),
+    if isinstance(data, np.ndarray) and data.dtype == np.float32 and data.flags['C_CONTIGUOUS'] and data.size <= 32768:
+        _check_call(
+            _LIB.XGDMatrixCreateFromMat_omp(
+                data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+                c_bst_ulong(data.shape[0]),
+                c_bst_ulong(data.shape[1]),
+                ctypes.c_float(missing),
+                ctypes.byref(handle),
+                ctypes.c_int(nthread),
+                ctypes.c_int(data_split_mode),
+            )
+        )
+    else:
+        _check_call(
+            _LIB.XGDMatrixCreateFromDense(
+                _array_interface(data),
+                make_jcargs(
+                    missing=float(missing),
+                    nthread=int(nthread),
+                    data_split_mode=int(data_split_mode),
+                ),
+                ctypes.byref(handle),
+            )
         )
-    )
     return handle, feature_names, feature_types
 
 
diff --git a/tests/python/microbench_numpy.py b/tests/python/microbench_numpy.py
@@ -0,0 +1,101 @@
+"""Micro-benchmark for DMatrix creation from a numpy array.
+
+Compares ``XGDMatrixCreateFromDense`` (array interface passed as JSON) with
+``XGDMatrixCreateFromMat_omp`` (raw float pointer) and prints a table of
+average timings.  Run it as a script.
+"""
+import numpy as np
+import xgboost as xgb
+from collections import defaultdict
+import timeit
+import ctypes
+from xgboost.core import _LIB, DataSplitMode
+from xgboost.data import _check_call, _array_interface, c_bst_ulong, make_jcargs
+
+COLS = 1000
+
+
+def measure_create_dmatrix(rows, cols, nthread, use_optimization):
+    """Time one DMatrix creation from a random float32 matrix.
+
+    Parameters
+    ----------
+    rows, cols : int
+        Shape of the generated C-contiguous float32 input.
+    nthread : int
+        Thread count forwarded to the native call.
+    use_optimization : bool
+        If true, call ``XGDMatrixCreateFromMat_omp`` with a raw float
+        pointer; otherwise call ``XGDMatrixCreateFromDense``.
+
+    Returns
+    -------
+    float
+        Seconds spent inside the native creation call, as measured by
+        ``timeit.default_timer``.
+    """
+    data = np.random.randn(rows, cols).astype(np.float32)
+    data = np.ascontiguousarray(data)
+
+    handle = ctypes.c_void_p()
+    missing = np.nan
+
+    start = timeit.default_timer()
+    if use_optimization:
+        ret = _LIB.XGDMatrixCreateFromMat_omp(
+            data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+            c_bst_ulong(data.shape[0]),
+            c_bst_ulong(data.shape[1]),
+            ctypes.c_float(missing),
+            ctypes.byref(handle),
+            ctypes.c_int(nthread),
+            ctypes.c_int(DataSplitMode.ROW),
+        )
+    else:
+        ret = _LIB.XGDMatrixCreateFromDense(
+            _array_interface(data),
+            make_jcargs(
+                missing=float(missing),
+                nthread=int(nthread),
+                data_split_mode=int(DataSplitMode.ROW),
+            ),
+            ctypes.byref(handle),
+        )
+    elapsed = timeit.default_timer() - start
+
+    # Validate the status code outside the timed region so a failed call is
+    # surfaced by ``_check_call`` instead of being reported as a timing.
+    _check_call(ret)
+    # Free the native DMatrix; the benchmark creates many thousands of them
+    # and would otherwise leak every one.
+    _check_call(_LIB.XGDMatrixFree(handle))
+    return elapsed
+
+
+def _total_seconds(rows, nthread, use_optimization, repeats):
+    """Sum the creation time of ``repeats`` runs for one configuration."""
+    return sum(
+        measure_create_dmatrix(
+            rows=rows, cols=COLS, nthread=nthread, use_optimization=use_optimization
+        )
+        for _ in range(repeats)
+    )
+
+
+def main():
+    """Run the benchmark grid and print one table row per configuration."""
+    print(f"{'Threads':8} | {'Rows':8} | {'Cols':8} | {'Current (sec)':15} | {'Optimized (sec)':15} | {'Ratio':12}")
+
+    for nthread in [1, 2, 4, 8]:
+        for rows in [1, 4, 16, 64, 256, 1024, 4096, 16384]:
+            # The repetition count shrinks as the row count grows.
+            repeats = 65536 // rows
+
+            current = _total_seconds(rows, nthread, False, repeats)
+            optimized = _total_seconds(rows, nthread, True, repeats)
+
+            print(f"{nthread:8} | {rows:8} | {COLS:8} | {current/repeats:15.4g} | {optimized/repeats:15.4g} | {optimized / current:12.1%}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py
@@ -210,6 +210,47 @@ def test_dmatrix_numpy_init_omp(self):
             assert dm.num_row() == row
             assert dm.num_col() == cols
 
+    def _test_dmatrix_numpy_init_omp_contiguous(self, test_contiguous: bool):
+        """Round-trip a numpy matrix through ``xgb.DMatrix`` and compare.
+
+        With ``test_contiguous`` set, X is C-contiguous float32, the layout
+        the direct-load branch of ``_from_numpy_array`` checks for; otherwise
+        X is Fortran-ordered, which fails that branch's C-contiguity check.
+        """
+        # 100 * 50 = 5000 elements is within the 32768-element limit of the
+        # direct-load branch, so that branch is actually exercised; the other
+        # row counts all exceed the limit (1000 * 50 = 50000).
+        rows = [100, 1000, 11326, 15000]
+        cols = 50
+        for row in rows:
+            X = np.random.randn(row, cols)
+            y = np.random.randn(row).astype("f")
+
+            if test_contiguous:
+                X = np.ascontiguousarray(X).astype(np.float32)
+                y = np.ascontiguousarray(y).astype(np.float32)
+                assert X.flags['C_CONTIGUOUS']
+            else:
+                X = np.asfortranarray(X)
+                y = np.asfortranarray(y)
+                assert not X.flags['C_CONTIGUOUS']
+
+            # Repeat the same round-trip checks for several nthread values.
+            for nthread in (0, 1, 10):
+                dm = xgb.DMatrix(X, y, nthread=nthread)
+                np.testing.assert_allclose(dm.get_data().toarray(), X, rtol=1e-7)
+                np.testing.assert_array_equal(dm.get_label(), y)
+                assert dm.num_row() == row
+                assert dm.num_col() == cols
+
+    def test_dmatrix_numpy_init_omp_contiguous(self):
+        """Check DMatrix creation from C-contiguous float32 input."""
+        return self._test_dmatrix_numpy_init_omp_contiguous(True)
+
+    def test_dmatrix_numpy_init_omp_not_contiguous(self):
+        """Check DMatrix creation from Fortran-ordered input."""
+        return self._test_dmatrix_numpy_init_omp_contiguous(False)
+
     def test_cv(self):
         dm, _ = tm.load_agaricus(__file__)
         params = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}