Skip to content

Normalization API

L1, L2, Max Normalizations

normalize

normalize(X: sparray, norm: str = 'l2', axis: int = 1, inplace: bool = False) -> sps.csr_array

Normalize a sparse matrix along rows or columns using L1, L2, or max-norm.

Parameters:

Name Type Description Default
X sparray

Input sparse matrix.

required
norm str

Normalization method ('l1', 'l2', or 'max').

'l2'
axis int

Normalize rows (1) or columns (0).

1
inplace bool

Whether to modify the matrix in place.

False

Returns:

Type Description
csr_array

Normalized CSR matrix.

Source code in similaripy/normalization.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def normalize(
    X: sps.sparray,
    norm: str = 'l2',
    axis: int = 1,
    inplace: bool = False,
) -> sps.csr_array:
    """
    Scale a sparse matrix along rows or columns by its L1, L2, or max norm.

    Args:
        X: Sparse matrix to normalize.
        norm: Name of the normalization to apply ('l1', 'l2', or 'max').
        axis: 1 to normalize rows, 0 to normalize columns.
        inplace: Whether the matrix is modified in place.

    Returns:
        The normalized matrix as a CSR array.

    Raises:
        ValueError: If ``norm`` is not one of the supported normalizations.
    """
    if norm not in _NORMALIZATIONS:
        raise ValueError(f"norm must be one of {_NORMALIZATIONS}, got '{norm}'")

    csr = _prepare_csr(X, axis, inplace)

    # The selected kernel works directly on the CSR buffers it is handed;
    # its return value is not used.
    kernel = _NORM_DISPATCH[norm]
    kernel(
        shape=csr.shape,
        data=csr.data,
        indices=csr.indices,
        indptr=csr.indptr,
    )

    return _finalize_csr(csr, axis)
TF-IDF

tfidf

tfidf(X: sparray, axis: int = 1, logbase: float = e, tf_mode: str = 'sqrt', idf_mode: str = 'smooth', inplace: bool = False) -> sps.csr_array

Apply TF-IDF normalization to a sparse matrix.

Parameters:

Name Type Description Default
X sparray

Input sparse matrix.

required
axis int

Normalize rows (1) or columns (0).

1
logbase float

Logarithm base.

e
tf_mode str

Term frequency mode.

'sqrt'
idf_mode str

Inverse document frequency mode.

'smooth'
inplace bool

Modify the matrix in place.

False

Returns:

Type Description
csr_array

TF-IDF normalized CSR matrix.

Source code in similaripy/normalization.py
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
def tfidf(
    X: sps.sparray,
    axis: int = 1,
    logbase: float = e,
    tf_mode: str = 'sqrt',
    idf_mode: str = 'smooth',
    inplace: bool = False,
) -> sps.csr_array:
    """
    Weight a sparse matrix with TF-IDF.

    Args:
        X: Sparse matrix to normalize.
        axis: 1 to normalize rows, 0 to normalize columns.
        logbase: Base of the logarithm.
        tf_mode: Name of the term-frequency mode.
        idf_mode: Name of the inverse-document-frequency mode.
        inplace: Whether the matrix is modified in place.

    Returns:
        The TF-IDF normalized matrix as a CSR array.
    """
    # Both mode names are checked before the matrix is touched.
    _validate_modes(tf_mode, idf_mode)

    csr = _prepare_csr(X, axis, inplace)

    # The kernel is handed the CSR buffers directly; its return value is unused.
    _norm.inplace_normalize_csr_tfidf(
        shape=csr.shape,
        data=csr.data,
        indices=csr.indices,
        indptr=csr.indptr,
        tf_mode=tf_mode,
        idf_mode=idf_mode,
        logbase=logbase,
    )

    return _finalize_csr(csr, axis)
BM25

bm25

bm25(X: sparray, axis: int = 1, k1: float = 1.2, b: float = 0.75, logbase: float = e, tf_mode: str = 'raw', idf_mode: str = 'bm25', inplace: bool = False) -> sps.csr_array

Apply BM25 normalization to a sparse matrix.

Parameters:

Name Type Description Default
X sparray

Input sparse matrix.

required
axis int

Normalize rows (1) or columns (0).

1
k1 float

Term saturation parameter.

1.2
b float

Length normalization parameter.

0.75
logbase float

Logarithm base.

e
tf_mode str

Term frequency mode ('raw', 'log', 'sqrt', etc.).

'raw'
idf_mode str

Inverse document frequency mode ('bm25', 'smooth', etc.).

'bm25'
inplace bool

Modify the matrix in place.

False

Returns:

Type Description
csr_array

BM25-normalized CSR matrix.

Source code in similaripy/normalization.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
def bm25(
    X: sps.sparray,
    axis: int = 1,
    k1: float = 1.2,
    b: float = 0.75,
    logbase: float = e,
    tf_mode: str = 'raw',
    idf_mode: str = 'bm25',
    inplace: bool = False,
) -> sps.csr_array:
    """
    Weight a sparse matrix with BM25.

    Args:
        X: Sparse matrix to normalize.
        axis: 1 to normalize rows, 0 to normalize columns.
        k1: Term saturation parameter.
        b: Length normalization parameter.
        logbase: Base of the logarithm.
        tf_mode: Name of the term-frequency mode ('raw', 'log', 'sqrt', etc.).
        idf_mode: Name of the inverse-document-frequency mode
            ('bm25', 'smooth', etc.).
        inplace: Whether the matrix is modified in place.

    Returns:
        The BM25-normalized matrix as a CSR array.
    """
    # Both mode names are checked before the matrix is touched.
    _validate_modes(tf_mode, idf_mode)

    csr = _prepare_csr(X, axis, inplace)

    # Plain BM25 reuses the BM25+ kernel with delta fixed at 0.0.
    _norm.inplace_normalize_csr_bm25plus(
        shape=csr.shape,
        data=csr.data,
        indices=csr.indices,
        indptr=csr.indptr,
        k1=k1,
        b=b,
        delta=0.0,
        tf_mode=tf_mode,
        idf_mode=idf_mode,
        logbase=logbase,
    )

    return _finalize_csr(csr, axis)
BM25+

bm25plus

bm25plus(X: sparray, axis: int = 1, k1: float = 1.2, b: float = 0.75, delta: float = 1.0, logbase: float = e, tf_mode: str = 'raw', idf_mode: str = 'bm25', inplace: bool = False) -> sps.csr_array

Apply BM25+ normalization to a sparse matrix.

Parameters:

Name Type Description Default
X sparray

Input sparse matrix.

required
axis int

Normalize rows (1) or columns (0).

1
k1 float

Term saturation parameter.

1.2
b float

Length normalization parameter.

0.75
delta float

BM25+ boosting parameter.

1.0
logbase float

Logarithm base.

e
tf_mode str

Term frequency mode.

'raw'
idf_mode str

Inverse document frequency mode.

'bm25'
inplace bool

Modify the matrix in place.

False

Returns:

Type Description
csr_array

BM25+ normalized CSR matrix.

Source code in similaripy/normalization.py
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def bm25plus(
    X: sps.sparray,
    axis: int = 1,
    k1: float = 1.2,
    b: float = 0.75,
    delta: float = 1.0,
    logbase: float = e,
    tf_mode: str = 'raw',
    idf_mode: str = 'bm25',
    inplace: bool = False,
) -> sps.csr_array:
    """
    Weight a sparse matrix with BM25+.

    Args:
        X: Sparse matrix to normalize.
        axis: 1 to normalize rows, 0 to normalize columns.
        k1: Term saturation parameter.
        b: Length normalization parameter.
        delta: BM25+ boosting parameter.
        logbase: Base of the logarithm.
        tf_mode: Name of the term-frequency mode.
        idf_mode: Name of the inverse-document-frequency mode.
        inplace: Whether the matrix is modified in place.

    Returns:
        The BM25+ normalized matrix as a CSR array.
    """
    # Both mode names are checked before the matrix is touched.
    _validate_modes(tf_mode, idf_mode)

    csr = _prepare_csr(X, axis, inplace)

    # The kernel is handed the CSR buffers directly; its return value is unused.
    _norm.inplace_normalize_csr_bm25plus(
        shape=csr.shape,
        data=csr.data,
        indices=csr.indices,
        indptr=csr.indptr,
        k1=k1,
        b=b,
        delta=delta,
        tf_mode=tf_mode,
        idf_mode=idf_mode,
        logbase=logbase,
    )

    return _finalize_csr(csr, axis)