Class: SVMLight::Document

Inherits:
Object
  • Object
show all
Defined in:
lib/svmredlight/document.rb,
ext/svmredlight.c

Overview

A document is the Ruby representation of a DOC structure in SVMlight. It contains a queryid, a slackid, a costfactor ( c ) and a vector of feature numbers with their corresponding weights.

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.create(id, cost, slackid, queryid, words_ary) ⇒ Object

Creates a DOC from an array of words. It also takes an id (-1 is normally OK for that value when used in filtering) and the C (cost) parameter for the SVM. words_ary is an array of arrays like this:

[wnum, weight], [wnum, weight], …

This way we do not waste memory (which would defeat the svec implementation) and do not introduce a bunch of 0’s, which seem to be OK when classifying but break training.



683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
# File 'ext/svmredlight.c', line 683

/*
 * Document.create(id, cost, slackid, queryid, words_ary) -> Document
 *
 * Builds an SVMlight DOC from a sparse feature list and wraps it in a Ruby
 * object of class +klass+.
 *
 * id        - document number; converted with NUM2LONG, so any Integer works.
 * cost      - cost factor; converted with NUM2DBL.
 * slackid   - Fixnum slack id.
 * queryid   - Fixnum query id.
 * words_ary - non-empty Array of [wnum, weight] pairs, where wnum is a
 *             Fixnum greater than zero and weight is a Fixnum or Float.
 *
 * Raises TypeError when an argument or pair element has the wrong type, and
 * ArgumentError when words_ary is empty, a pair has fewer than two elements,
 * a weight is not numeric, or a feature number is not positive.
 *
 * All validation and every conversion that can raise happens BEFORE the
 * temporary WORD buffer is allocated, so an exception can no longer leak it.
 */
static VALUE
doc_create(VALUE klass, VALUE id, VALUE cost, VALUE slackid, VALUE queryid, VALUE words_ary ){
  long docnum, c_slackid, c_queryid, i, nwords;
  double c;
  WORD *words;
  SVECTOR *vec;
  DOC *d;
  VALUE pair, wnum, weight;

  Check_Type(words_ary, T_ARRAY);
  Check_Type(slackid, T_FIXNUM);
  Check_Type(queryid, T_FIXNUM);

  nwords = (long)RARRAY_LEN(words_ary);
  if (nwords == 0)
    rb_raise(rb_eArgError, "Cannot create Document from empty arrays");

  /* Scalar conversions first: NUM2DBL / NUM2LONG raise on non-numeric input. */
  c         = NUM2DBL(cost);
  docnum    = NUM2LONG(id);
  c_slackid = FIX2LONG(slackid);
  c_queryid = FIX2LONG(queryid);

  /* Pass 1: validate every pair while nothing is allocated yet. */
  for(i = 0; i < nwords; i++){
    pair = RARRAY_PTR(words_ary)[i];
    Check_Type(pair, T_ARRAY);

    /* Guard the [0] / [1] accesses below against short inner arrays. */
    if (RARRAY_LEN(pair) < 2)
      rb_raise(rb_eArgError, "Each feature must be a [number, weight] pair");

    wnum   = RARRAY_PTR(pair)[0];
    weight = RARRAY_PTR(pair)[1];

    Check_Type(wnum, T_FIXNUM);

    if(!(TYPE(weight) == T_FLOAT || TYPE(weight) == T_FIXNUM))
      rb_raise(rb_eArgError, "Feature weights must be numeric");

    if(FIX2LONG(wnum) <= 0)
      rb_raise(rb_eArgError, "Feature number has to be greater than zero");
  }

  /* Pass 2: fill the buffer. Nothing below raises: every element was
   * verified to be a Fixnum or Float above, and no Ruby code runs in between. */
  words = (WORD*) my_malloc(sizeof(WORD) * (nwords + 1));

  for(i = 0; i < nwords; i++){
    pair = RARRAY_PTR(words_ary)[i];
    (words[i]).wnum   = FIX2LONG(RARRAY_PTR(pair)[0]);
    (words[i]).weight = (FVAL)(NUM2DBL(RARRAY_PTR(pair)[1]));
  }
  words[nwords].wnum = 0; /* wnum == 0 terminates the word list */

  vec = create_svector(words, (char*)"", 1.0);

  /* Stock SVMlight's create_svector() copies the word list into the SVECTOR,
   * so the temporary buffer is released here.
   * NOTE(review): confirm the linked SVMlight version copies (it is
   * create_svector_shallow() that keeps the caller's pointer). */
  free(words);

  d = create_example(docnum, c_queryid, c_slackid, c, vec);

  return Data_Wrap_Struct(klass, 0, doc_free, d);
}

.new(vector, opts = {}) ⇒ Object

Parameters:

  • vector (Hash)

    a hash where the keys are feature numbers and the values its weights

  • opts (Hash) (defaults to: {})

    the options coincide with the SVMLight parameters to the create_example function; the default value for every option is 0

  • :docnum (Integer) — default: 0

    the document number passed to create_example

  • :costfactor (Numeric) — default: 0

    the cost factor ( c ) of the document

  • :slackid (Integer) — default: 0

    the slack id of the document

  • :queryid (Integer) — default: 0

    the query id of the document



11
12
13
14
15
16
17
18
19
# File 'lib/svmredlight/document.rb', line 11

# Builds a Document from a feature hash by delegating to +create+.
#
# Missing options fall back to 0. The caller's +opts+ hash is only read,
# never modified, so frozen hashes are accepted and the hash's own default
# value is left untouched.
#
# @param vector [Hash] feature numbers mapped to their weights; it is
#   converted to an array of [feature, weight] pairs with +to_a+
# @param opts [Hash] values forwarded positionally to +create+
# @option opts [Integer] :docnum (0) document number
# @option opts [Numeric] :costfactor (0) cost factor
# @option opts [Integer] :slackid (0) slack id
# @option opts [Integer] :queryid (0) query id
# @return [Object] whatever +create+ returns
def self.new(vector, opts={})
  docnum     = opts.fetch(:docnum, 0)
  costfactor = opts.fetch(:costfactor, 0)
  slackid    = opts.fetch(:slackid, 0)
  queryid    = opts.fetch(:queryid, 0)

  create(docnum, costfactor, slackid, queryid, vector.to_a)
end

Instance Method Details

#costfactor ⇒ Object



750
751
752
753
754
755
756
# File 'ext/svmredlight.c', line 750

/*
 * Document#costfactor
 *
 * Unwraps the DOC held by the receiver and returns its costfactor field
 * converted to a Ruby number with DBL2NUM.
 */
static VALUE
doc_get_costfactor(VALUE self){
  DOC *doc;
  VALUE result;

  Data_Get_Struct(self, DOC, doc);
  result = DBL2NUM(doc->costfactor);

  return result;
}

#docnum ⇒ Object



726
727
728
729
730
731
732
# File 'ext/svmredlight.c', line 726

/*
 * Document#docnum
 *
 * Unwraps the DOC held by the receiver and returns its docnum field as a
 * Ruby Integer.
 *
 * LONG2NUM is used instead of INT2FIX because INT2FIX performs no range
 * check and would silently mangle a value that does not fit in a Fixnum.
 * NOTE(review): assumes DOC.docnum is a C long, as in stock SVMlight.
 */
static VALUE
doc_get_docnum(VALUE self){
  DOC *d;
  Data_Get_Struct(self, DOC, d);

  return LONG2NUM(d->docnum);
}

#queryid ⇒ Object



742
743
744
745
746
747
748
# File 'ext/svmredlight.c', line 742

/*
 * Document#queryid
 *
 * Unwraps the DOC held by the receiver and returns its queryid field as a
 * Ruby Integer.
 *
 * LONG2NUM is used instead of INT2FIX because INT2FIX performs no range
 * check and would silently mangle a value that does not fit in a Fixnum.
 * NOTE(review): assumes DOC.queryid is a C long, as in stock SVMlight.
 */
static VALUE
doc_get_queryid(VALUE self){
  DOC *d;
  Data_Get_Struct(self, DOC, d);

  return LONG2NUM(d->queryid);
}

#slackid ⇒ Object



734
735
736
737
738
739
740
# File 'ext/svmredlight.c', line 734

/*
 * Document#slackid
 *
 * Unwraps the DOC held by the receiver and returns its slackid field as a
 * Ruby Integer.
 *
 * LONG2NUM is used instead of INT2FIX because INT2FIX performs no range
 * check and would silently mangle a value that does not fit in a Fixnum.
 * NOTE(review): assumes DOC.slackid is a C long, as in stock SVMlight.
 */
static VALUE
doc_get_slackid(VALUE self){
  DOC *d;
  Data_Get_Struct(self, DOC, d);

  return LONG2NUM(d->slackid);
}