@@ -42,7 +42,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
42
42
val pipelineModel = pipeline.fit(emptyDataSet)
43
43
val resultDf = pipelineModel.transform(emptyDataSet)
44
44
45
- assert(resultDf.count() > 1 )
45
+ assert(resultDf.count() == 1 )
46
46
}
47
47
48
48
it should " output clean flatten text without any structured metadata" taggedAs FastTest in {
@@ -52,7 +52,6 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
52
52
.setContentPath(s " $htmlFilesDirectory/example-div.html " )
53
53
.setOutputCol(" document" )
54
54
.setFlattenOutput(true )
55
- .setExplodeDocs(false )
56
55
57
56
val pipeline = new Pipeline ().setStages(Array (reader2Doc))
58
57
@@ -91,20 +90,20 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
91
90
}
92
91
}
93
92
94
- it should " convert Reader output to Document format with one row per document " taggedAs FastTest in {
93
+ it should " convert Reader output to Document format with exploded documents " taggedAs FastTest in {
95
94
96
95
val reader2Doc = new Reader2Doc ()
97
96
.setContentType(" text/html" )
98
97
.setContentPath(s " $htmlFilesDirectory/example-div.html " )
99
98
.setOutputCol(" document" )
100
- .setExplodeDocs(false )
99
+ .setExplodeDocs(true )
101
100
102
101
val pipeline = new Pipeline ().setStages(Array (reader2Doc))
103
102
104
103
val pipelineModel = pipeline.fit(emptyDataSet)
105
104
val resultDf = pipelineModel.transform(emptyDataSet)
106
105
107
- assert(resultDf.count() == 1 )
106
+ assert(resultDf.count() > 1 )
108
107
}
109
108
110
109
it should " work with Tokenizer" taggedAs FastTest in {
@@ -116,9 +115,8 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
116
115
117
116
val pipelineModel = pipeline.fit(emptyDataSet)
118
117
val resultDf = pipelineModel.transform(emptyDataSet)
119
- resultDf.select(" document" ).show(truncate = false )
120
118
121
- assert(resultDf.count() > 1 )
119
+ assert(resultDf.count() == 1 )
122
120
}
123
121
124
122
it should " work for Text documents" taggedAs FastTest in {
@@ -132,7 +130,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
132
130
val pipelineModel = pipeline.fit(emptyDataSet)
133
131
val resultDf = pipelineModel.transform(emptyDataSet)
134
132
135
- assert(resultDf.count() > 1 )
133
+ assert(resultDf.count() == 1 )
136
134
}
137
135
138
136
it should " work for Word documents" taggedAs FastTest in {
@@ -146,7 +144,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
146
144
val pipelineModel = pipeline.fit(emptyDataSet)
147
145
val resultDf = pipelineModel.transform(emptyDataSet)
148
146
149
- assert(resultDf.count() > 1 )
147
+ assert(resultDf.count() == 1 )
150
148
}
151
149
152
150
it should " work with PDF documents" taggedAs FastTest in {
@@ -160,7 +158,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
160
158
val pipelineModel = pipeline.fit(emptyDataSet)
161
159
val resultDf = pipelineModel.transform(emptyDataSet)
162
160
163
- assert(resultDf.count() > 1 )
161
+ assert(resultDf.count() == 1 )
164
162
}
165
163
166
164
it should " work with Markdown" taggedAs FastTest in {
@@ -174,7 +172,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
174
172
val pipelineModel = pipeline.fit(emptyDataSet)
175
173
val resultDf = pipelineModel.transform(emptyDataSet)
176
174
177
- assert(resultDf.count() > 1 )
175
+ assert(resultDf.count() == 1 )
178
176
}
179
177
180
178
it should " work with XML" taggedAs FastTest in {
@@ -188,7 +186,7 @@ class Reader2DocTest extends AnyFlatSpec with SparkSessionTest {
188
186
val pipelineModel = pipeline.fit(emptyDataSet)
189
187
val resultDf = pipelineModel.transform(emptyDataSet)
190
188
191
- assert(resultDf.count() > 1 )
189
+ assert(resultDf.count() == 1 )
192
190
}
193
191
194
192
it should " throw if contentPath is not set" taggedAs FastTest in {
0 commit comments