
Default value of lambdarank_pair_method in ranking parameters is topk, not mean #10097


Description

@xbanke

The documentation says that the default value of lambdarank_pair_method is mean, but the code snippets below show that the default is actually topk: training with the default and with 'topk' set explicitly produces identical results, while 'mean' produces different ones. The following code was run on version '2.1.0-dev'.

import numpy as np
import pandas as pd
import xgboost as xgb

np.random.seed(42)

n_groups = 1000
group_size = 2000
n_features = 100
n_levels = 20

rows = n_groups * group_size

# Synthetic data: random features, `group_size` rows per query id, and
# integer relevance labels in [0, n_levels) derived from within-group ranks.
features = pd.DataFrame(np.random.randn(rows, n_features).astype('float32'), columns=[f'f{i:03d}' for i in range(n_features)])
qids = pd.Series(np.arange(rows, dtype='int') // group_size)
labels = pd.Series(np.random.randn(rows).astype('float32')).groupby(qids).rank(method='first').sub(1) // (group_size // n_levels)

dmatrix = xgb.DMatrix(features, label=labels, qid=qids)

Keep default:

params = {
    'objective': 'rank:pairwise',
    # 'objective': 'multi:softprob',
    # 'num_class': n_levels,
    
    # 'base_score': 0.5,
    # 'lambdarank_pair_method': 'mean',
    'lambdarank_normalization': False,
    'lambdarank_num_pair_per_sample': 1,
    'booster': 'gbtree',
    'tree_method': 'hist',
    'verbosity': 1,
    'seed': 42,
    'learning_rate': 0.1,
    'max_depth': 6,
    'gamma': 1,
    'min_child_weight': 4,
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'nthread': 20,
    'reg_lambda': 1,
    'reg_alpha': 1,
    'eval_metric': ['ndcg@100', 'ndcg@500', 'ndcg@1000'],
}

booster = xgb.train(params, dmatrix, 100, verbose_eval=10, evals=[(dmatrix, 'train')])

# [0]	train-ndcg@100:0.10113	train-ndcg@500:0.21286	train-ndcg@1000:0.36429
# [10]	train-ndcg@100:0.10559	train-ndcg@500:0.21556	train-ndcg@1000:0.36704
# [20]	train-ndcg@100:0.10740	train-ndcg@500:0.21668	train-ndcg@1000:0.36754
# [30]	train-ndcg@100:0.10885	train-ndcg@500:0.21791	train-ndcg@1000:0.36867
# [40]	train-ndcg@100:0.10967	train-ndcg@500:0.21847	train-ndcg@1000:0.36898
# [50]	train-ndcg@100:0.11000	train-ndcg@500:0.21876	train-ndcg@1000:0.36956
# [60]	train-ndcg@100:0.11101	train-ndcg@500:0.21930	train-ndcg@1000:0.36988
# [70]	train-ndcg@100:0.11165	train-ndcg@500:0.21962	train-ndcg@1000:0.37038
# [80]	train-ndcg@100:0.11218	train-ndcg@500:0.22016	train-ndcg@1000:0.37073
# [90]	train-ndcg@100:0.11272	train-ndcg@500:0.22057	train-ndcg@1000:0.37113
# [99]	train-ndcg@100:0.11303	train-ndcg@500:0.22097	train-ndcg@1000:0.37141
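
To see which pair method was actually applied, one can inspect the booster's internal configuration via Booster.save_config(). The exact JSON layout of the config is version-dependent, so this sketch simply scans the tree for lambdarank-related keys rather than hardcoding a path:

import json

def find_keys(node, needle, path=''):
    # Recursively walk the config tree and print any key containing `needle`.
    if isinstance(node, dict):
        for k, v in node.items():
            if needle in k:
                print(f'{path}/{k} = {v}')
            find_keys(v, needle, f'{path}/{k}')
    elif isinstance(node, list):
        for i, v in enumerate(node):
            find_keys(v, needle, f'{path}[{i}]')

cfg = json.loads(booster.save_config())
# Expected to show pair_method = topk rather than the documented mean.
find_keys(cfg, 'lambdarank')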

Use topk:

params = {
    'objective': 'rank:pairwise',
    # 'objective': 'multi:softprob',
    # 'num_class': n_levels,
    
    # 'base_score': 0.5,
    'lambdarank_pair_method': 'topk',
    'lambdarank_normalization': False,
    'lambdarank_num_pair_per_sample': 1,
    'booster': 'gbtree',
    'tree_method': 'hist',
    'verbosity': 1,
    'seed': 42,
    'learning_rate': 0.1,
    'max_depth': 6,
    'gamma': 1,
    'min_child_weight': 4,
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'nthread': 20,
    'reg_lambda': 1,
    'reg_alpha': 1,
    'eval_metric': ['ndcg@100', 'ndcg@500', 'ndcg@1000'],
}

booster = xgb.train(params, dmatrix, 100, verbose_eval=10, evals=[(dmatrix, 'train')])


# [0]	train-ndcg@100:0.10113	train-ndcg@500:0.21286	train-ndcg@1000:0.36429
# [10]	train-ndcg@100:0.10559	train-ndcg@500:0.21556	train-ndcg@1000:0.36704
# [20]	train-ndcg@100:0.10740	train-ndcg@500:0.21668	train-ndcg@1000:0.36754
# [30]	train-ndcg@100:0.10885	train-ndcg@500:0.21791	train-ndcg@1000:0.36867
# [40]	train-ndcg@100:0.10967	train-ndcg@500:0.21847	train-ndcg@1000:0.36898
# [50]	train-ndcg@100:0.11000	train-ndcg@500:0.21876	train-ndcg@1000:0.36956
# [60]	train-ndcg@100:0.11101	train-ndcg@500:0.21930	train-ndcg@1000:0.36988
# [70]	train-ndcg@100:0.11165	train-ndcg@500:0.21962	train-ndcg@1000:0.37038
# [80]	train-ndcg@100:0.11218	train-ndcg@500:0.22016	train-ndcg@1000:0.37073
# [90]	train-ndcg@100:0.11272	train-ndcg@500:0.22057	train-ndcg@1000:0.37113
# [99]	train-ndcg@100:0.11303	train-ndcg@500:0.22097	train-ndcg@1000:0.37141
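
The two logs above are identical down to the last digit, which already suggests the default run and the explicit-topk run trained the same model. As an extra sanity check, the predictions can be compared directly (booster_default and booster_topk are hypothetical names for the boosters from the two runs above, kept in separate variables instead of overwriting `booster`):

pred_default = booster_default.predict(dmatrix)
pred_topk = booster_topk.predict(dmatrix)
# If the default really were 'mean', these would diverge; given the
# identical eval logs, this is expected to print True.
print(np.array_equal(pred_default, pred_topk))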

Use mean:

params = {
    'objective': 'rank:pairwise',
    # 'objective': 'multi:softprob',
    # 'num_class': n_levels,
    
    # 'base_score': 0.5,
    'lambdarank_pair_method': 'mean',
    'lambdarank_normalization': False,
    'lambdarank_num_pair_per_sample': 1,
    'booster': 'gbtree',
    'tree_method': 'hist',
    'verbosity': 1,
    'seed': 42,
    'learning_rate': 0.1,
    'max_depth': 6,
    'gamma': 1,
    'min_child_weight': 4,
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'nthread': 20,
    'reg_lambda': 1,
    'reg_alpha': 1,
    'eval_metric': ['ndcg@100', 'ndcg@500', 'ndcg@1000'],
}

booster = xgb.train(params, dmatrix, 100, verbose_eval=10, evals=[(dmatrix, 'train')])

# [0]	train-ndcg@100:0.10479	train-ndcg@500:0.21641	train-ndcg@1000:0.36769
# [10]	train-ndcg@100:0.11850	train-ndcg@500:0.23082	train-ndcg@1000:0.38270
# [20]	train-ndcg@100:0.12608	train-ndcg@500:0.23867	train-ndcg@1000:0.39229
# [30]	train-ndcg@100:0.13020	train-ndcg@500:0.24464	train-ndcg@1000:0.39863
# [40]	train-ndcg@100:0.13368	train-ndcg@500:0.24896	train-ndcg@1000:0.40384
# [50]	train-ndcg@100:0.13506	train-ndcg@500:0.25252	train-ndcg@1000:0.40807
# [60]	train-ndcg@100:0.13731	train-ndcg@500:0.25628	train-ndcg@1000:0.41222
# [70]	train-ndcg@100:0.14007	train-ndcg@500:0.25916	train-ndcg@1000:0.41580
# [80]	train-ndcg@100:0.14266	train-ndcg@500:0.26243	train-ndcg@1000:0.41901
# [90]	train-ndcg@100:0.14480	train-ndcg@500:0.26488	train-ndcg@1000:0.42216
# [99]	train-ndcg@100:0.14655	train-ndcg@500:0.26781	train-ndcg@1000:0.42499

One more thing: the default value of the lambdarank_normalization parameter introduced in #9625 is True. To keep behavior compatible with older versions, I think it should perhaps be False.
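
Until the documentation and the implementation agree, a defensive workaround is to set both parameters explicitly instead of relying on defaults, for example:

params = {
    'objective': 'rank:pairwise',
    # Pin both values explicitly so behavior does not change across versions:
    'lambdarank_pair_method': 'mean',   # the documented default
    'lambdarank_normalization': False,  # pre-#9625 behavior
    # ... remaining training parameters as above ...
}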
