-
Notifications
You must be signed in to change notification settings - Fork 4
/
corpus.cpp
138 lines (120 loc) · 2.64 KB
/
corpus.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
// Part of the code in this file is taken from
// the LDA-C code by Prof. David Blei.
// It reads corpus files in the LDA-C format.
//
#include <stdlib.h>
#include <stdio.h>
#include "corpus.h"
#include "utils.h"
// Releases the word-id and count arrays owned by this document and puts
// every field back to its "empty" value (NULL arrays, zero length/total,
// id of -1).
void document::free_document()
{
  // Scalar bookkeeping first; these do not depend on the arrays.
  id = -1;
  total = 0;
  length = 0;

  // delete [] on a NULL pointer is a no-op, so this is safe to call on an
  // already-empty document.
  delete [] counts;
  counts = NULL;
  delete [] words;
  words = NULL;
}
void document::read_data(FILE * fileptr, bool& eof, int _id)
{
free_document();
int OFFSET = 0;
int word;
int count;
if ((fscanf(fileptr, "%10d", &length) != EOF))
{
words = new int [length];
counts = new int [length];
for (int n = 0; n < length; n++)
{
fscanf(fileptr, "%10d:%10d", &word, &count);
word = word - OFFSET;
words[n] = word;
counts[n] = count;
total += count;
}
id = _id;
eof = false;
} else {
eof = true;
}
}
// Returns the result of max() applied to the first `length` entries of
// `words`.
// NOTE(review): max() is not defined in this file (presumably utils.h); it
// is assumed to return the largest element and to report its position
// through the third argument, which is not needed here - confirm.
int document::get_max_word_id() const
{
  // A stack local is enough for the scratch output slot; no heap
  // allocation (and no matching delete) is required.
  int index = 0;
  return max(words, length, &index);
}
// Constructs an empty corpus: zero vocabulary size, zero total words and
// zero documents.
corpus::corpus()
  : size_vocab(0),
    total_words(0),
    num_docs(0)
{
}
// Destroys the corpus. All cleanup is delegated to free_corpus(), which
// deletes the owned documents, clears the container and zeroes the
// counters, so the teardown logic lives in exactly one place.
corpus::~corpus()
{
  free_corpus();
}
// Deletes every document held in `docs`, empties the container and resets
// the corpus counters (vocabulary size, document count, total words) to 0.
void corpus::free_corpus()
{
  // Walk the container itself rather than trusting num_docs: the counter
  // is maintained separately, and if it ever disagrees with the container
  // the loop would either leak documents or index past the end.
  for (size_t i = 0; i < docs.size(); i++)
  {
    delete docs[i];
  }
  docs.clear();

  size_vocab = 0;
  num_docs = 0;
  total_words = 0;
}
void corpus::read_data(const char * filename)
{
free_corpus();
int OFFSET = 0;
FILE * fileptr;
int length, count, word, n, nd, nw;
// reading the data
printf("\nreading data from %s\n", filename);
fileptr = fopen(filename, "r");
nd = 0;
nw = 0;
while ((fscanf(fileptr, "%10d", &length) != EOF))
{
if (length > 0) {
document * doc = new document(length);
for (n = 0; n < length; n++)
{
fscanf(fileptr, "%10d:%10d", &word, &count);
word = word - OFFSET;
doc->words[n] = word;
doc->counts[n] = count;
doc->total += count;
if (word >= nw)
{
nw = word + 1;
}
}
total_words += doc->total;
doc->id = nd;
docs.push_back(doc);
nd++;
}
}
fclose(fileptr); // close the file
num_docs = nd;
size_vocab = nw;
printf("number of docs : %d\n", nd);
printf("number of terms : %d\n", nw);
printf("number of total words : %d\n", total_words);
}
// end of the file