Skip to content

Commit 3ee4695

Browse files
Improve DMatrix creation performance in python
The XGBoost Python package serializes NumPy arrays as JSON. This has non-trivial overhead for small datasets. This patch optimizes the specific case where the NumPy array is already in C-contiguous 32-bit floating-point format and has rows*cols <= 32768, loading it directly without the JSON layer.

Results from xgboost/tests/python/microbench_numpy.py (Ratio = Optimized / Current):

Threads | Rows | Cols | Current (sec) | Optimized (sec) | Ratio
1 | 1 | 1000 | 0.0001921 | 0.0001703 | 88.6%
1 | 4 | 1000 | 0.0001689 | 0.0001437 | 85.1%
1 | 16 | 1000 | 0.0002639 | 0.0002457 | 93.1%
1 | 64 | 1000 | 0.0006843 | 0.0006719 | 98.2%
1 | 256 | 1000 | 0.002611 | 0.002655 | 101.7%
1 | 1024 | 1000 | 0.013 | 0.0126 | 97.0%
1 | 4096 | 1000 | 0.06081 | 0.0593 | 97.5%
1 | 16384 | 1000 | 0.2981 | 0.2974 | 99.8%
2 | 1 | 1000 | 0.0001415 | 0.0001196 | 84.6%
2 | 4 | 1000 | 0.0002155 | 0.0002003 | 93.0%
2 | 16 | 1000 | 0.0002137 | 0.000196 | 91.7%
2 | 64 | 1000 | 0.0005054 | 0.0004855 | 96.1%
2 | 256 | 1000 | 0.001613 | 0.001687 | 104.6%
2 | 1024 | 1000 | 0.007743 | 0.008194 | 105.8%
2 | 4096 | 1000 | 0.03791 | 0.03783 | 99.8%
2 | 16384 | 1000 | 0.2077 | 0.2037 | 98.1%
4 | 1 | 1000 | 0.0001374 | 0.0001237 | 90.0%
4 | 4 | 1000 | 0.0001985 | 0.0001621 | 81.7%
4 | 16 | 1000 | 0.0002266 | 0.0001988 | 87.7%
4 | 64 | 1000 | 0.0005175 | 0.0004775 | 92.3%
4 | 256 | 1000 | 0.00166 | 0.001594 | 96.0%
4 | 1024 | 1000 | 0.008257 | 0.008097 | 98.1%
4 | 4096 | 1000 | 0.03492 | 0.0354 | 101.4%
4 | 16384 | 1000 | 0.1896 | 0.1897 | 100.0%
8 | 1 | 1000 | 0.0001471 | 0.0001254 | 85.3%
8 | 4 | 1000 | 0.0003609 | 0.000326 | 90.4%
8 | 16 | 1000 | 0.0002651 | 0.0002217 | 83.6%
8 | 64 | 1000 | 0.0003504 | 0.0003064 | 87.5%
8 | 256 | 1000 | 0.0008264 | 0.0008729 | 105.6%
8 | 1024 | 1000 | 0.003367 | 0.003127 | 92.9%
8 | 4096 | 1000 | 0.01932 | 0.01799 | 93.1%
8 | 16384 | 1000 | 0.1245 | 0.1208 | 97.0%
1 parent 0c44067 commit 3ee4695

File tree

3 files changed

+120
-10
lines changed

3 files changed

+120
-10
lines changed

python-package/xgboost/data.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -252,17 +252,30 @@ def _from_numpy_array(
252252
_check_data_shape(data)
253253
data, _ = _ensure_np_dtype(data, data.dtype)
254254
handle = ctypes.c_void_p()
255-
_check_call(
256-
_LIB.XGDMatrixCreateFromDense(
257-
_array_interface(data),
258-
make_jcargs(
259-
missing=float(missing),
260-
nthread=int(nthread),
261-
data_split_mode=int(data_split_mode),
262-
),
263-
ctypes.byref(handle),
255+
if isinstance(data, np.ndarray) and data.dtype == np.float32 and data.flags['C_CONTIGUOUS'] and data.size <= 32768:
256+
_check_call(
257+
_LIB.XGDMatrixCreateFromMat_omp(
258+
data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
259+
c_bst_ulong(data.shape[0]),
260+
c_bst_ulong(data.shape[1]),
261+
ctypes.c_float(missing),
262+
ctypes.byref(handle),
263+
ctypes.c_int(nthread),
264+
ctypes.c_int(data_split_mode),
265+
)
266+
)
267+
else:
268+
_check_call(
269+
_LIB.XGDMatrixCreateFromDense(
270+
_array_interface(data),
271+
make_jcargs(
272+
missing=float(missing),
273+
nthread=int(nthread),
274+
data_split_mode=int(data_split_mode),
275+
),
276+
ctypes.byref(handle),
277+
)
264278
)
265-
)
266279
return handle, feature_names, feature_types
267280

268281

tests/python/microbench_numpy.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
import numpy as np
2+
import xgboost as xgb
3+
from collections import defaultdict
4+
import timeit
5+
import ctypes
6+
from xgboost.core import _LIB, DataSplitMode
7+
from xgboost.data import _check_call, _array_interface, c_bst_ulong, make_jcargs
8+
9+
def measure_create_dmatrix(rows, cols, nthread, use_optimization):
    """Time a single native DMatrix construction from random float32 data.

    Parameters
    ----------
    rows, cols :
        Shape of the randomly generated input matrix.
    nthread :
        Thread count forwarded to the native constructor.
    use_optimization :
        When true, call ``XGDMatrixCreateFromMat_omp`` with a raw float
        pointer; otherwise call ``XGDMatrixCreateFromDense`` with the JSON
        array interface and JSON-encoded arguments.

    Returns
    -------
    float
        Seconds elapsed (``timeit.default_timer``) around the native call and
        the preparation of its arguments. Input generation, return-code
        checking and handle cleanup are outside the timed region.

    Raises
    ------
    Whatever ``_check_call`` raises when the native library reports a failure.
    """
    data = np.ascontiguousarray(np.random.randn(rows, cols).astype(np.float32))

    handle = ctypes.c_void_p()
    missing = np.nan

    start = timeit.default_timer()
    if use_optimization:
        ret = _LIB.XGDMatrixCreateFromMat_omp(
            data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
            c_bst_ulong(data.shape[0]),
            c_bst_ulong(data.shape[1]),
            ctypes.c_float(missing),
            ctypes.byref(handle),
            ctypes.c_int(nthread),
            ctypes.c_int(DataSplitMode.ROW),
        )
    else:
        ret = _LIB.XGDMatrixCreateFromDense(
            _array_interface(data),
            make_jcargs(
                missing=float(missing),
                nthread=int(nthread),
                data_split_mode=int(DataSplitMode.ROW),
            ),
            ctypes.byref(handle),
        )
    end = timeit.default_timer()

    # Without this check a failed call would be reported as a (fast) success.
    _check_call(ret)
    # The benchmark builds many matrices; release each native handle so the
    # process does not accumulate them for the whole run.
    _check_call(_LIB.XGDMatrixFree(handle))
    return end - start
39+
40+
COLS = 1000
41+
42+
print(f"{'Threads':8} | {'Rows':8} | {'Cols':8} | {'Current (sec)':15} | {'Optimized (sec)':15} | {'Ratio':12}")
43+
44+
for nthread in [1, 2, 4, 8]:
45+
for rows in [1, 4, 16, 64, 256, 1024, 4096, 16384]:
46+
repeats = 65536 // rows
47+
48+
current = 0
49+
for i in range(repeats):
50+
current += measure_create_dmatrix(rows=rows, cols=COLS, nthread=nthread, use_optimization=False)
51+
52+
optimized = 0
53+
for i in range(repeats):
54+
optimized += measure_create_dmatrix(rows=rows, cols=COLS, nthread=nthread, use_optimization=True)
55+
56+
print(f"{nthread:8} | {rows:8} | {COLS:8} | {current/repeats:15.4g} | {optimized/repeats:15.4g} | {optimized / current:12.1%}")

tests/python/test_basic.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,47 @@ def test_dmatrix_numpy_init_omp(self):
210210
assert dm.num_row() == row
211211
assert dm.num_col() == cols
212212

213+
def _test_dmatrix_numpy_init_omp_contiguous(self, test_contiguous: bool):
    """Check DMatrix construction from NumPy input of a given memory layout.

    Parameters
    ----------
    test_contiguous :
        When true, ``X`` is a C-contiguous float32 array; otherwise ``X`` is
        converted to Fortran order (and asserted not to be C-contiguous).

    For every matrix size and for ``nthread`` in ``(0, 1, 10)`` the DMatrix
    must report the input's shape, return the labels unchanged and return
    data equal to the input within ``rtol=1e-7``.
    """
    # 100 * 50 = 5000 elements is within the ``data.size <= 32768`` cutoff of
    # the direct-pointer path in ``_from_numpy_array``; the remaining sizes
    # (50000 elements and up) all exceed it and only reach the generic path.
    rows = [100, 1000, 11326, 15000]
    cols = 50
    for row in rows:
        X = np.random.randn(row, cols)
        y = np.random.randn(row).astype("f")

        if test_contiguous:
            X = np.ascontiguousarray(X).astype(np.float32)
            y = np.ascontiguousarray(y).astype(np.float32)
            assert X.flags['C_CONTIGUOUS']
        else:
            X = np.asfortranarray(X)
            y = np.asfortranarray(y)
            assert not X.flags['C_CONTIGUOUS']

        for nthread in (0, 1, 10):
            dm = xgb.DMatrix(X, y, nthread=nthread)
            np.testing.assert_allclose(dm.get_data().toarray(), X, rtol=1e-7)
            np.testing.assert_array_equal(dm.get_label(), y)
            assert dm.num_row() == row
            assert dm.num_col() == cols
247+
248+
def test_dmatrix_numpy_init_omp_contiguous(self):
    """Run the shared NumPy-initialisation checks with C-contiguous input."""
    self._test_dmatrix_numpy_init_omp_contiguous(test_contiguous=True)
250+
251+
def test_dmatrix_numpy_init_omp_not_contiguous(self):
    """Run the shared NumPy-initialisation checks with Fortran-ordered input."""
    self._test_dmatrix_numpy_init_omp_contiguous(test_contiguous=False)
253+
213254
def test_cv(self):
214255
dm, _ = tm.load_agaricus(__file__)
215256
params = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"}

0 commit comments

Comments
 (0)