Skip to content

Normalization API

Basic Normalizations

L1, L2, Max Normalizations

Normalize a sparse matrix along rows or columns using L1, L2, or max-norm.

Parameters:

Name Type Description Default
X spmatrix

Input sparse matrix.

required
norm str

Normalization method ('l1', 'l2', or 'max').

'l2'
axis int

Normalize rows (1) or columns (0).

1
inplace bool

Whether to modify the matrix in place.

False

Returns:

Type Description
csr_matrix

Normalized CSR matrix.

Source code in similaripy/normalization.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def normalize(
    X: sps.spmatrix,
    norm: str = 'l2',
    axis: int = 1,
    inplace: bool = False
) -> sps.csr_matrix:
    """
    Normalize a sparse matrix along rows or columns using L1, L2, or max-norm.

    Args:
        X: Input sparse matrix.
        norm: Normalization method ('l1', 'l2', or 'max').
        axis: Normalize rows (1) or columns (0).
        inplace: Whether to modify the matrix in place.

    Returns:
        Normalized CSR matrix.

    Raises:
        ValueError: If ``norm`` is not one of the supported normalizations.
    """
    # Validate with an explicit raise rather than ``assert``: assertions are
    # stripped under ``python -O``, which would let an unsupported ``norm``
    # fall through the dispatch below and return the matrix unnormalized.
    if norm not in _NORMALIZATIONS:
        raise ValueError(
            f"Unsupported norm {norm!r}; expected one of {_NORMALIZATIONS!r}."
        )
    X = check_matrix(X)

    if not inplace:
        X = X.copy()
    # NOTE(review): with inplace=True the caller's matrix is presumably only
    # updated when none of the conversions below allocate new arrays
    # (e.g. CSR input with axis=1) -- confirm against check_matrix/tocsr.

    # The kernels work row-wise on CSR data, so column normalization is done
    # on the transpose and transposed back afterwards.
    if axis == 0:
        X = X.T

    X = X.tocsr()
    csr_args = dict(shape=X.shape, data=X.data, indices=X.indices, indptr=X.indptr)
    if norm == 'l1':
        _norm.inplace_normalize_csr_l1(**csr_args)
    elif norm == 'l2':
        _norm.inplace_normalize_csr_l2(**csr_args)
    elif norm == 'max':
        _norm.inplace_normalize_csr_max(**csr_args)

    if axis == 0:
        X = X.T
    return X.tocsr()

TF-IDF and BM25 Families

TF-IDF

Apply TF-IDF normalization to a sparse matrix.

Parameters:

Name Type Description Default
X spmatrix

Input sparse matrix.

required
axis int

Normalize rows (1) or columns (0).

1
logbase float

Logarithm base.

e
tf_mode str

Term frequency mode.

'sqrt'
idf_mode str

Inverse document frequency mode.

'smooth'
inplace bool

Modify the matrix in place.

False

Returns:

Type Description
csr_matrix

TF-IDF normalized CSR matrix.

Source code in similaripy/normalization.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def tfidf(
    X: sps.spmatrix,
    axis: int = 1,
    logbase: float = e,
    tf_mode: str = 'sqrt',
    idf_mode: str = 'smooth',
    inplace: bool = False
) -> sps.csr_matrix:
    """
    Apply TF-IDF normalization to a sparse matrix.

    Args:
        X: Input sparse matrix.
        axis: Normalize rows (1) or columns (0).
        logbase: Logarithm base.
        tf_mode: Term frequency mode.
        idf_mode: Inverse document frequency mode.
        inplace: Modify the matrix in place.

    Returns:
        TF-IDF normalized CSR matrix.

    Raises:
        ValueError: If ``tf_mode`` or ``idf_mode`` is not a supported mode.
    """
    # Explicit raises instead of ``assert``: assertions disappear under
    # ``python -O`` and would let invalid modes reach the compiled kernel.
    if tf_mode not in _TF:
        raise ValueError(
            f"Unsupported tf_mode {tf_mode!r}; expected one of {_TF!r}."
        )
    if idf_mode not in _IDF:
        raise ValueError(
            f"Unsupported idf_mode {idf_mode!r}; expected one of {_IDF!r}."
        )
    X = check_matrix(X)

    if not inplace:
        X = X.copy()
    # NOTE(review): with inplace=True the caller's matrix is presumably only
    # updated when none of the conversions below allocate new arrays
    # (e.g. CSR input with axis=1) -- confirm against check_matrix/tocsr.

    # The kernel operates on CSR data, so the axis=0 case is handled by
    # transposing before the call and transposing back afterwards.
    if axis == 0:
        X = X.T

    X = X.tocsr()
    _norm.inplace_normalize_csr_tfidf(
        shape=X.shape, data=X.data, indices=X.indices, indptr=X.indptr,
        tf_mode=tf_mode, idf_mode=idf_mode, logbase=logbase,
    )

    if axis == 0:
        X = X.T
    return X.tocsr()
BM25 Normalization

Apply BM25 normalization to a sparse matrix.

Parameters:

Name Type Description Default
X spmatrix

Input sparse matrix.

required
axis int

Normalize rows (1) or columns (0).

1
k1 float

Term saturation parameter.

1.2
b float

Length normalization parameter.

0.75
logbase float

Logarithm base.

e
tf_mode str

Term frequency mode ('raw', 'log', 'sqrt', etc.).

'raw'
idf_mode str

Inverse document frequency mode ('bm25', 'smooth', etc.).

'bm25'
inplace bool

Modify the matrix in place.

False

Returns:

Type Description
csr_matrix

BM25-normalized CSR matrix.

Source code in similaripy/normalization.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def bm25(
    X: sps.spmatrix,
    axis: int = 1,
    k1: float = 1.2,
    b: float = 0.75,
    logbase: float = e,
    tf_mode: str = 'raw',
    idf_mode: str = 'bm25',
    inplace: bool = False
) -> sps.csr_matrix:
    """
    Apply BM25 normalization to a sparse matrix.

    Args:
        X: Input sparse matrix.
        axis: Normalize rows (1) or columns (0).
        k1: Term saturation parameter.
        b: Length normalization parameter.
        logbase: Logarithm base.
        tf_mode: Term frequency mode ('raw', 'log', 'sqrt', etc.).
        idf_mode: Inverse document frequency mode ('bm25', 'smooth', etc.).
        inplace: Modify the matrix in place.

    Returns:
        BM25-normalized CSR matrix.

    Raises:
        ValueError: If ``tf_mode`` or ``idf_mode`` is not a supported mode.
    """
    # Explicit raises instead of ``assert``: assertions disappear under
    # ``python -O`` and would let invalid modes reach the compiled kernel.
    if tf_mode not in _TF:
        raise ValueError(
            f"Unsupported tf_mode {tf_mode!r}; expected one of {_TF!r}."
        )
    if idf_mode not in _IDF:
        raise ValueError(
            f"Unsupported idf_mode {idf_mode!r}; expected one of {_IDF!r}."
        )
    X = check_matrix(X)

    if not inplace:
        X = X.copy()
    # NOTE(review): with inplace=True the caller's matrix is presumably only
    # updated when none of the conversions below allocate new arrays
    # (e.g. CSR input with axis=1) -- confirm against check_matrix/tocsr.

    # The kernel operates on CSR data, so the axis=0 case is handled by
    # transposing before the call and transposing back afterwards.
    if axis == 0:
        X = X.T

    X = X.tocsr()
    # Plain BM25 reuses the BM25+ kernel with the delta term fixed at 0.0.
    _norm.inplace_normalize_csr_bm25plus(
        shape=X.shape, data=X.data, indices=X.indices, indptr=X.indptr,
        k1=k1, b=b, delta=0.0,
        tf_mode=tf_mode, idf_mode=idf_mode, logbase=logbase,
    )

    if axis == 0:
        X = X.T
    return X.tocsr()
BM25+ Normalization

Apply BM25+ normalization to a sparse matrix.

Parameters:

Name Type Description Default
X spmatrix

Input sparse matrix.

required
axis int

Normalize rows (1) or columns (0).

1
k1 float

Term saturation parameter.

1.2
b float

Length normalization parameter.

0.75
delta float

BM25+ boosting parameter.

1.0
logbase float

Logarithm base.

e
tf_mode str

Term frequency mode.

'raw'
idf_mode str

Inverse document frequency mode.

'bm25'
inplace bool

Modify the matrix in place.

False

Returns:

Type Description
csr_matrix

BM25+ normalized CSR matrix.

Source code in similaripy/normalization.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def bm25plus(
    X: sps.spmatrix,
    axis: int = 1,
    k1: float = 1.2,
    b: float = 0.75,
    delta: float = 1.0,
    logbase: float = e,
    tf_mode: str = 'raw',
    idf_mode: str = 'bm25',
    inplace: bool = False
) -> sps.csr_matrix:
    """
    Apply BM25+ normalization to a sparse matrix.

    Args:
        X: Input sparse matrix.
        axis: Normalize rows (1) or columns (0).
        k1: Term saturation parameter.
        b: Length normalization parameter.
        delta: BM25+ boosting parameter.
        logbase: Logarithm base.
        tf_mode: Term frequency mode.
        idf_mode: Inverse document frequency mode.
        inplace: Modify the matrix in place.

    Returns:
        BM25+ normalized CSR matrix.

    Raises:
        ValueError: If ``tf_mode`` or ``idf_mode`` is not a supported mode.
    """
    # Explicit raises instead of ``assert``: assertions disappear under
    # ``python -O`` and would let invalid modes reach the compiled kernel.
    if tf_mode not in _TF:
        raise ValueError(
            f"Unsupported tf_mode {tf_mode!r}; expected one of {_TF!r}."
        )
    if idf_mode not in _IDF:
        raise ValueError(
            f"Unsupported idf_mode {idf_mode!r}; expected one of {_IDF!r}."
        )
    X = check_matrix(X)

    if not inplace:
        X = X.copy()
    # NOTE(review): with inplace=True the caller's matrix is presumably only
    # updated when none of the conversions below allocate new arrays
    # (e.g. CSR input with axis=1) -- confirm against check_matrix/tocsr.

    # The kernel operates on CSR data, so the axis=0 case is handled by
    # transposing before the call and transposing back afterwards.
    if axis == 0:
        X = X.T

    X = X.tocsr()
    _norm.inplace_normalize_csr_bm25plus(
        shape=X.shape, data=X.data, indices=X.indices, indptr=X.indptr,
        k1=k1, b=b, delta=delta,
        tf_mode=tf_mode, idf_mode=idf_mode, logbase=logbase,
    )

    if axis == 0:
        X = X.T
    return X.tocsr()