-
Notifications
You must be signed in to change notification settings - Fork 206
Closed
Description
With the information I gained from #58 I can now create a error report. I decided to use gdb --args python2.7 test.py to debug, because the bug only appears when using the python interface.
test.py:
import cPickle as pickle
from scipy import io,sparse
preferencesLocalArray = pickle.load(open("preferences.pickle","rb"))
features = io.mmread(open("sparse_features.mmw","rb"))
features = features.tocsc()
#shuffle pairwise preferences for train and test split
import numpy as np
np.random.seed(123L)
random_indices = np.random.permutation(preferencesLocalArray.shape[0])
preferencesLocalArrayShuffled = np.array(preferencesLocalArray[random_indices])
train_percentage = 95
trainIdx = range(int(preferencesLocalArray.shape[0]/100.0*train_percentage))
testIdx = range(int(preferencesLocalArray.shape[0]/100.0*train_percentage),preferencesLocalArray.shape[0])
posExamples = preferencesLocalArrayShuffled[testIdx,0]
negExamples = preferencesLocalArrayShuffled[testIdx,1]
from fastFM import bpr
import numpy as np
fm = bpr.FMRecommender(n_iter=500000,init_stdev=0.01,l2_reg_w=.2,l2_reg_V=1.,step_size=.01,rank=100, random_state=11)
fm.fit(features,preferencesLocalArrayShuffled[140615:140616])
When fitting BPR with a big sparse feature matrix (just user and item hot-encoded) the solver access non-existing memory here:
ffm_sgc.c
int p_n = Ap[sample_row_n];
int p_p = Ap[sample_row_p];
while (p_n < Ap[sample_row_n + 1] || p_p < Ap[sample_row_p + 1]) {
double grad = 0;
int i_to_update = Ai[p_p] <= Ai[p_n] ? Ai[p_p] : Ai[p_n];
double theta_w = coef->w->data[i_to_update];
// incrementing the smaller index or both if equal
if (Ai[p_p] == i_to_update) {
grad = Ax[p_p]; <------------------------------------------------------------ HERE
p_p++;
}
if (Ai[p_n] == i_to_update) {
grad -= Ax[p_n];
p_n++;
}
the gdb backtrace:
#0 ffm_fit_sgd_bpr (coef=coef@entry=0x7fffffffd520, A=A@entry=0x496d780, pairs=pairs@entry=0x7fffffffd4e0, param=...) at ffm_sgd.c:96
#1 0x00007fffdf7fbd34 in ffm_sgd_bpr_fit (w_0=w_0@entry=0x7fffffffd5e8, w=<optimized out>, V=<optimized out>, X=X@entry=0x496d780,
pairs=0x4a2a2a0, n_pairs=1000000, param=param@entry=0x4a1b090) at ffm.c:131
#2 0x00007fffdf7e9c10 in __pyx_pf_3ffm_10ffm_fit_sgd_bpr (__pyx_v_fm=__pyx_v_fm@entry=0x7fffd4c10650,
__pyx_v_X=__pyx_v_X@entry=0x7fffcef43a10, __pyx_v_pairs=0x7fffd4c02350, __pyx_self=<optimized out>) at fastFM/ffm.c:4748
#3 0x00007fffdf7eb5eb in __pyx_pw_3ffm_11ffm_fit_sgd_bpr (__pyx_self=<optimized out>, __pyx_args=<optimized out>,
__pyx_kwds=<optimized out>) at fastFM/ffm.c:4468
#4 0x00007ffff7af6138 in PyEval_EvalFrameEx () from /usr/lib64/libpython2.7.so.1.0
#5 0x00007ffff7af6612 in PyEval_EvalFrameEx () from /usr/lib64/libpython2.7.so.1.0
#6 0x00007ffff7af77cd in PyEval_EvalCodeEx () from /usr/lib64/libpython2.7.so.1.0
#7 0x00007ffff7af78d2 in PyEval_EvalCode () from /usr/lib64/libpython2.7.so.1.0
#8 0x00007ffff7b1055f in ?? () from /usr/lib64/libpython2.7.so.1.0
#9 0x00007ffff7b1167e in PyRun_FileExFlags () from /usr/lib64/libpython2.7.so.1.0
#10 0x00007ffff7b127e9 in PyRun_SimpleFileExFlags () from /usr/lib64/libpython2.7.so.1.0
#11 0x00007ffff7b234bf in Py_Main () from /usr/lib64/libpython2.7.so.1.0
#12 0x00007ffff6d52b15 in __libc_start_main () from /lib64/libc.so.6
#13 0x00000000004006f1 in _start ()
(gdb) frame 0
#0 ffm_fit_sgd_bpr (coef=coef@entry=0x7fffffffd520, A=A@entry=0x496d780, pairs=pairs@entry=0x7fffffffd4e0, param=...) at ffm_sgd.c:96
96 grad = Ax[p_p];
some variables that are defined in frame 0
(gdb) info locals
grad = 0
i_to_update = 0
theta_w = -0.002142975917008301
comparison_row = 140615
sample_row_p = 4977937
sample_row_n = 24502
pairs_err = -0.48447366131550462
p_n = 49005
p_p = 9956862
i = 140615
p = <optimized out>
Ap = 0x7fffc0716010
Ai = 0x7fffc1a14010
Ax = 0x7fffc4010010
step_size = 0.01
n_comparisons = 1000000
k = 100
(gdb) print A->n
$82 = 4978177
(gdb) print A->m
$83 = 1972040
(gdb) print A->p
#some tests:
(gdb) print Ax[p_n]
$58 = 1
(gdb) print Ax[p_p]
Cannot access memory at address 0x7fffc8c07000
(gdb) print Ax[p_p-1]
$59 = 0
(gdb) print Ax[p_p+1]
Cannot access memory at address 0x7fffc8c07008
(gdb) print Ax[*p_p]
$60 = 1 // I don't get why this is now possible; is it an arbitrary number from anywhere in the memory?
(gdb) print A->nzmax
$77 = 9956354
but
p_p = 9956862 which is not in range anymore!!!
The python csc matrix has in fact 9956354 one's.
(gdb) print A->n (columns (examples))
$78 = 4978177
(gdb) print A->m (rows (features))
$79 = 1972040
Things I've figured out:
- when writing the whole data to files and passing them into the cli, the training runs successful (while crashing during prediction, but this is probably another story/issue). This gives us the hint that the error might not be in the C code itself but rather in the cython code for preparing the data.
- I can crash the fitting process by passing just one specific learning pair, but the whole feature matrix. This is the exact learning pair you see in the stack above.
- CXSparse matrices member p (X->p) can be filled with two different contents which might be the difference between using cli or python:
p → Int32List
Column pointers (size n+1) or col indices (size nzmax).
https://www.dartdocs.org/documentation/csparse/0.4.1/cxsparse/Matrix-class.html
While in externals/CXSparse/Source/cs_entry.c, the member p is filled with the column id
#include "cs.h"
/* add an entry to a triplet matrix; return 1 if ok, 0 otherwise */
CS_INT cs_entry (cs *T, CS_INT i, CS_INT j, CS_ENTRY x)
{
if (!CS_TRIPLET (T) || i < 0 || j < 0) return (0) ; /* check inputs */
if (T->nz >= T->nzmax && !cs_sprealloc (T,2*(T->nzmax))) return (0) ;
if (T->x) T->x [T->nz] = x ;
T->i [T->nz] = i ;
T->p [T->nz++] = j ; <---------------------- HERE
T->m = CS_MAX (T->m, i+1) ;
T->n = CS_MAX (T->n, j+1) ;
return (1) ;
}
While in ffm.pyx the member p is filled by the matrix member indptr
# Create a CsMatrix object and return as a capsule
def CsMatrix(X not None):
cdef cffm.cs_di *p
p = <cffm.cs_di *> malloc(sizeof(cffm.cs_di))
if p == NULL:
raise MemoryError("No memory to make a Point")
cdef int i
cdef np.ndarray[int, ndim=1, mode = 'c'] indptr = X.indptr <-------------- HERE
cdef np.ndarray[int, ndim=1, mode = 'c'] indices = X.indices
cdef np.ndarray[double, ndim=1, mode = 'c'] data = X.data
# Put the scipy data into the CSparse struct. This is just copying some
# pointers.
p.nzmax = X.data.shape[0]
p.m = X.shape[0]
p.n = X.shape[1]
p.p = &indptr[0] <------------------ AND HERE
p.i = &indices[0]
p.x = &data[0]
p.nz = -1 # to indicate CSC format
return PyCapsule_New(<void *>p, "CsMatrix",
<PyCapsule_Destructor>del_CsMatrix)
which is the indptr from the csc sparse matrix format and differs in meaning from above (as far as I understood).
I hope, @ibayer , you can help me fixing this issue.
Metadata
Metadata
Assignees
Labels
No labels