001/* 002 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved. 003 * 004 * Project and contact information: https://cascading.wensel.net/ 005 * 006 * This file is part of the Cascading project. 007 * 008 * Licensed under the Apache License, Version 2.0 (the "License"); 009 * you may not use this file except in compliance with the License. 010 * You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, software 015 * distributed under the License is distributed on an "AS IS" BASIS, 016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 017 * See the License for the specific language governing permissions and 018 * limitations under the License. 019 */ 020 021package cascading; 022 023import java.io.Serializable; 024import java.util.ArrayList; 025import java.util.Collection; 026import java.util.Collections; 027import java.util.Comparator; 028import java.util.HashMap; 029import java.util.List; 030import java.util.Map; 031import java.util.regex.Pattern; 032 033import cascading.cascade.Cascades; 034import cascading.flow.Flow; 035import cascading.operation.Debug; 036import cascading.operation.Filter; 037import cascading.operation.Function; 038import cascading.operation.Identity; 039import cascading.operation.Insert; 040import cascading.operation.NoOp; 041import cascading.operation.aggregator.Count; 042import cascading.operation.aggregator.First; 043import cascading.operation.expression.ExpressionFunction; 044import cascading.operation.filter.And; 045import cascading.operation.function.UnGroup; 046import cascading.operation.regex.RegexFilter; 047import cascading.operation.regex.RegexParser; 048import cascading.operation.regex.RegexSplitter; 049import cascading.pipe.Each; 050import cascading.pipe.Every; 051import cascading.pipe.GroupBy; 052import cascading.pipe.Merge; 053import cascading.pipe.Pipe; 054import cascading.tap.MultiSourceTap; 055import cascading.tap.SinkMode; 056import cascading.tap.Tap; 057import cascading.tuple.Fields; 058import cascading.tuple.Hasher; 059import cascading.tuple.Tuple; 060import org.junit.Test; 061 062import static cascading.ComparePlatformsTest.NONDETERMINISTIC; 063import static data.InputData.*; 064 065public class FieldedPipesPlatformTest extends PlatformTestCase 066 { 067 public FieldedPipesPlatformTest() 068 { 069 super( true, 5, 3 ); // leave cluster testing enabled 070 } 071 072 @Test 073 public void testSimpleGroup() throws Exception 074 { 075 getPlatform().copyFromLocal( inputFileApache ); 076 077 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); 078 079 Pipe pipe = new Pipe( "test" ); 080 081 pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); 082 083 pipe = new GroupBy( pipe, new Fields( "ip" ) ); 084 085 pipe = new Every( pipe, new Count(), new Fields( "ip", "count" ) ); 086 087 Tap sink = getPlatform().getTextFile( getOutputPath( "simple" ), SinkMode.REPLACE ); 088 089 Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); 090 091 flow.complete(); 092 093 validateLength( flow.openSource(), 10 ); // validate source, this once, as a sanity check 094 validateLength( flow, 8, null ); 095 } 096 097 @Test 098 public void testSimpleChain() throws Exception 099 { 100 getPlatform().copyFromLocal( inputFileApache ); 101 102 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); 103 104 Pipe pipe = new Pipe( "test" ); 105 106 pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); 107 108 pipe = new GroupBy( pipe, new Fields( "ip" ) ); 109 110 pipe = new Every( pipe, new Count( new Fields( "count1" ) ) ); 111 pipe = new Every( pipe, new Count( new Fields( "count2" ) ) ); 112 pipe = new Every( pipe, new Count( new Fields( "count3" ) ) ); 113 pipe = new Every( pipe, new Count( new Fields( "count4" ) ) ); 114 115 Tap sink = getPlatform().getTabDelimitedFile( Fields.ALL, getOutputPath( "simplechain" ), SinkMode.REPLACE ); 116 117 Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); 118 119 flow.complete(); 120 121 validateLength( flow, 8, 5 ); 122 } 123 124 @Test 125 public void testChainEndingWithEach() throws Exception 126 { 127 getPlatform().copyFromLocal( inputFileApache ); 128 129 Pipe pipe = new Pipe( "test" ); 130 131 pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); 132 133 pipe = new GroupBy( pipe, new Fields( "ip" ) ); 134 135 pipe = new Every( pipe, new Count( new Fields( "count1" ) ) ); 136 pipe = new Every( pipe, new Count( new Fields( "count2" ) ) ); 137 138 pipe = new Each( pipe, new Fields( "count1", "count2" ), new ExpressionFunction( new Fields( "sum" ), "count1 + count2", int.class ), Fields.ALL ); 139 140 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); 141 Tap sink = getPlatform().getTextFile( getOutputPath( "chaineach" ), SinkMode.REPLACE ); 142 143 Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); 144 145 flow.complete(); 146 147 validateLength( flow, 8, null ); 148 } 149 150 // also tests the RegexSplitter 151 152 @Test 153 public void testNoGroup() throws Exception 154 { 155 getPlatform().copyFromLocal( inputFileApache ); 156 157 Tap source = getPlatform().getTextFile( new Fields( "line" ), inputFileApache ); 158 159 Pipe pipe = new Pipe( "test" ); 160 161 pipe = new Each( pipe, new RegexSplitter( "\\s+" ), new Fields( 1 ) ); 162 163 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "nogroup" ), SinkMode.REPLACE ); 164 165 Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); 166 167 flow.complete(); 168 169 validateLength( flow, 10, null ); 170 171 List<Tuple> results = getSinkAsList( flow ); 172 173 assertTrue( results.contains( new Tuple( "75.185.76.245" ) ) ); 174 } 175 176 @Test 177 public void testCopy() throws Exception 178 { 179 getPlatform().copyFromLocal( inputFileApache ); 180 181 Tap source = getPlatform().getTextFile( new Fields( "line" ), inputFileApache ); 182 183 Pipe pipe = new Pipe( "test" ); 184 185 Tap sink = getPlatform().getTextFile( getOutputPath( "copy" ), SinkMode.REPLACE ); 186 187 Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); 188 189 flow.complete(); 190 191 validateLength( flow, 10, null ); 192 } 193 194 @Test 195 public void testSimpleMerge() throws Exception 196 { 197 getPlatform().copyFromLocal( inputFileLower ); 198 getPlatform().copyFromLocal( inputFileUpper ); 199 200 Tap sourceLower = getPlatform().getTextFile( inputFileLower ); 201 Tap sourceUpper = getPlatform().getTextFile( inputFileUpper ); 202 203 Map sources = new HashMap(); 204 205 sources.put( "lower", sourceLower ); 206 sources.put( "upper", sourceUpper ); 207 208 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 209 210 // using null pos so all fields are written 211 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "simplemerge" ), SinkMode.REPLACE ); 212 213 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 214 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 215 216 Pipe splice = new GroupBy( "merge", Pipe.pipes( pipeLower, pipeUpper ), new Fields( "num" ), null, false ); 217 218 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 219 220 flow.complete(); 221 222 validateLength( flow, 10 ); 223 224 Collection results = getSinkAsList( flow ); 225 226 assertTrue( "missing value", results.contains( new Tuple( "1\ta" ) ) ); 227 assertTrue( "missing value", results.contains( new Tuple( "1\tA" ) ) ); 228 assertTrue( "missing value", results.contains( new Tuple( "2\tb" ) ) ); 229 assertTrue( "missing value", results.contains( new Tuple( "2\tB" ) ) ); 230 assertTrue( "missing value", results.contains( new Tuple( "3\tc" ) ) ); 231 assertTrue( "missing value", results.contains( new Tuple( "3\tC" ) ) ); 232 } 233 234 /** 235 * Specifically tests GroupBy will return the correct grouping fields to the following Every 236 * <p> 237 * additionally tests secondary sorting during merging 238 * 239 * @throws Exception 240 */ 241 @Test 242 public void testSimpleMergeThree() throws Exception 243 { 244 getPlatform().copyFromLocal( inputFileLower ); 245 getPlatform().copyFromLocal( inputFileUpper ); 246 getPlatform().copyFromLocal( inputFileLowerOffset ); 247 248 Tap sourceLower = getPlatform().getTextFile( inputFileLower ); 249 Tap sourceUpper = getPlatform().getTextFile( inputFileUpper ); 250 Tap sourceLowerOffset = getPlatform().getTextFile( inputFileLowerOffset ); 251 252 Map sources = new HashMap(); 253 254 sources.put( "lower", sourceLower ); 255 sources.put( "upper", sourceUpper ); 256 sources.put( "offset", sourceLowerOffset ); 257 258 Tap sink = getPlatform().getDelimitedFile( Fields.ALL, "\t", getOutputPath( "simplemergethree" ), SinkMode.REPLACE ); 259 260 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 261 262 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 263 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 264 Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), splitter ); 265 266 Pipe splice = new GroupBy( "merge", Pipe.pipes( pipeLower, pipeUpper, pipeOffset ), new Fields( "num" ), new Fields( "char" ) ); 267 268 splice = new Every( splice, new Fields( "char" ), new First( new Fields( "first" ) ) ); 269 270 splice = new Each( splice, new Fields( "num", "first" ), new Identity() ); 271 272 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 273 274 flow.complete(); 275 276 validateLength( flow, 6 ); 277 278 List<Tuple> tuples = getSinkAsList( flow ); 279 280 assertTrue( tuples.contains( new Tuple( "1", "A" ) ) ); 281 assertTrue( tuples.contains( new Tuple( "2", "B" ) ) ); 282 assertTrue( tuples.contains( new Tuple( "3", "C" ) ) ); 283 assertTrue( tuples.contains( new Tuple( "4", "D" ) ) ); 284 assertTrue( tuples.contains( new Tuple( "5", "E" ) ) ); 285 assertTrue( tuples.contains( new Tuple( "6", "c" ) ) ); 286 } 287 288 @Test 289 public void testSameSourceMerge() throws Exception 290 { 291 getPlatform().copyFromLocal( inputFileLower ); 292 293 Tap sourceLower = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower ); 294 295 Map sources = new HashMap(); 296 297 sources.put( "lower", sourceLower ); 298 sources.put( "upper", sourceLower ); 299 300 // using null pos so all fields are written 301 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath(), SinkMode.REPLACE ); 302 303 Pipe pipeLower = new Pipe( "lower" ); 304 Pipe pipeUpper = new Pipe( "upper" ); 305 306 Pipe splice = new GroupBy( "merge", Pipe.pipes( pipeLower, pipeUpper ), new Fields( "num" ), null, false ); 307 308 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 309 310 flow.complete(); 311 312 validateLength( flow, 10 ); 313 314 Collection results = getSinkAsList( flow ); 315 316 assertEquals( "missing value", 2, Collections.frequency( results, new Tuple( "1\ta" ) ) ); 317 assertEquals( "missing value", 2, Collections.frequency( results, new Tuple( "2\tb" ) ) ); 318 assertEquals( "missing value", 2, Collections.frequency( results, new Tuple( "3\tc" ) ) ); 319 } 320 321 /** 322 * same test as MergePipesTest, but to test that chained groupby don't exhibit similar failures 323 * 324 * @throws Exception 325 */ 326 @Test 327 public void testSameSourceMergeThreeChainGroup() throws Exception 328 { 329 getPlatform().copyFromLocal( inputFileLower ); 330 331 Tap sourceLower = getPlatform().getTextFile( inputFileLower ); 332 333 Map sources = new HashMap(); 334 335 sources.put( "split", sourceLower ); 336 337 Tap sink = getPlatform().getTextFile( getOutputPath( "samemergethreechaingroup" ), SinkMode.REPLACE ); 338 339 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 340 341 Pipe pipe = new Pipe( "split" ); 342 343 Pipe pipeLower = new Each( new Pipe( "lower", pipe ), new Fields( "line" ), splitter ); 344 Pipe pipeUpper = new Each( new Pipe( "upper", pipe ), new Fields( "line" ), splitter ); 345 Pipe pipeOffset = new Each( new Pipe( "offset", pipe ), new Fields( "line" ), splitter ); 346 347 //put group before merge to test path counts 348 Pipe splice = new GroupBy( Pipe.pipes( pipeLower, pipeUpper ), new Fields( "num" ) ); 349 350 // this group has its incoming paths counted, gated by the previous group 351 splice = new GroupBy( Pipe.pipes( splice, pipeOffset ), new Fields( "num" ) ); 352 353 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 354 355 if( getPlatform().isMapReduce() ) 356 assertEquals( "wrong num jobs", 2, flow.getFlowSteps().size() ); 357 358 flow.complete(); 359 360 validateLength( flow, 15 ); 361 } 362 363 @Test 364 public void testUnGroup() throws Exception 365 { 366 getPlatform().copyFromLocal( inputFileJoined ); 367 368 Tap source = getPlatform().getTextFile( inputFileJoined ); 369 Tap sink = getPlatform().getTextFile( getOutputPath( "ungrouped" ), SinkMode.REPLACE ); 370 371 Pipe pipe = new Pipe( "test" ); 372 373 pipe = new Each( pipe, new Fields( "line" ), new RegexSplitter( new Fields( "num", "lower", "upper" ) ) ); 374 375 pipe = new Each( pipe, new UnGroup( new Fields( "num", "char" ), new Fields( "num" ), Fields.fields( new Fields( "lower" ), new Fields( "upper" ) ) ) ); 376 377 Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); 378 379 flow.complete(); 380 381 validateLength( flow, 10 ); 382 } 383 384 @Test 385 public void testUnGroupAnon() throws Exception 386 { 387 getPlatform().copyFromLocal( inputFileJoined ); 388 389 Tap source = getPlatform().getTextFile( inputFileJoined ); 390 Tap sink = getPlatform().getTextFile( getOutputPath( "ungroupedanon" ), SinkMode.REPLACE ); 391 392 Pipe pipe = new Pipe( "test" ); 393 394 pipe = new Each( pipe, new Fields( "line" ), new RegexSplitter( new Fields( "num", "lower", "upper" ) ) ); 395 396 pipe = new Each( pipe, new UnGroup( new Fields( "num" ), Fields.fields( new Fields( "lower" ), new Fields( "upper" ) ) ) ); 397 398 Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); 399 400 flow.complete(); 401 402 validateLength( flow, 10 ); 403 } 404 405 @Test 406 public void testUnGroupBySize() throws Exception 407 { 408 getPlatform().copyFromLocal( inputFileJoinedExtra ); 409 410 Tap source = getPlatform().getTextFile( inputFileJoinedExtra ); 411 Tap sink = getPlatform().getTextFile( getOutputPath( "ungrouped_size" ), SinkMode.REPLACE ); 412 413 Pipe pipe = new Pipe( "test" ); 414 415 pipe = new Each( pipe, new Fields( "line" ), new RegexSplitter( new Fields( "num1", "num2", "lower", "upper" ) ) ); 416 417 pipe = new Each( pipe, new UnGroup( new Fields( "num1", "num2", "char" ), new Fields( "num1", "num2" ), 1 ) ); 418 419 Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); 420 421 flow.complete(); 422 423 List<Tuple> tuples = asList( flow, sink ); 424 assertEquals( 10, tuples.size() ); 425 426 List<Object> values = new ArrayList<Object>(); 427 for( Tuple tuple : tuples ) 428 values.add( tuple.getObject( 1 ) ); 429 430 assertTrue( values.contains( "1\t1\ta" ) ); 431 assertTrue( values.contains( "1\t1\tA" ) ); 432 assertTrue( values.contains( "2\t2\tb" ) ); 433 assertTrue( values.contains( "2\t2\tB" ) ); 434 assertTrue( values.contains( "3\t3\tc" ) ); 435 assertTrue( values.contains( "3\t3\tC" ) ); 436 assertTrue( values.contains( "4\t4\td" ) ); 437 assertTrue( values.contains( "4\t4\tD" ) ); 438 assertTrue( values.contains( "5\t5\te" ) ); 439 assertTrue( values.contains( "5\t5\tE" ) ); 440 } 441 442 @Test 443 public void testFilter() throws Exception 444 { 445 getPlatform().copyFromLocal( inputFileApache ); 446 447 Tap source = getPlatform().getTextFile( inputFileApache ); 448 Tap sink = getPlatform().getTextFile( getOutputPath( "filter" ), SinkMode.REPLACE ); 449 450 Pipe pipe = new Pipe( "test" ); 451 452 Filter filter = new RegexFilter( "^68.*" ); 453 454 pipe = new Each( pipe, new Fields( "line" ), filter ); 455 456 Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); 457 458 flow.complete(); 459 460 validateLength( flow, 3 ); 461 } 462 463 @Test 464 public void testLogicFilter() throws Exception 465 { 466 getPlatform().copyFromLocal( inputFileApache ); 467 468 Tap source = getPlatform().getTextFile( inputFileApache ); 469 Tap sink = getPlatform().getTextFile( getOutputPath( "logicfilter" ), SinkMode.REPLACE ); 470 471 Pipe pipe = new Pipe( "test" ); 472 473 Filter filter = new And( new RegexFilter( "^68.*$" ), new RegexFilter( "^1000.*$" ) ); 474 475 pipe = new Each( pipe, new Fields( "line" ), filter ); 476 477 Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); 478 479 flow.complete(); 480 481 validateLength( flow, 3 ); 482 } 483 484 @Test 485 public void testFilterComplex() throws Exception 486 { 487 getPlatform().copyFromLocal( inputFileApache ); 488 489 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); 490 Tap sink = getPlatform().getTextFile( getOutputPath( "filtercomplex" ), SinkMode.REPLACE ); 491 492 Pipe pipe = new Pipe( "test" ); 493 494 pipe = new Each( pipe, new Fields( "line" ), TestConstants.APACHE_COMMON_PARSER ); 495 496 pipe = new Each( pipe, new Fields( "method" ), new RegexFilter( "^POST" ) ); 497 pipe = new Each( pipe, new Fields( "method" ), new RegexFilter( "^POST" ) ); 498 499 pipe = new Each( pipe, new Fields( "method" ), new Identity( new Fields( "value" ) ), Fields.ALL ); 500 501 pipe = new GroupBy( pipe, new Fields( "value" ) ); 502 503 pipe = new Every( pipe, new Count(), new Fields( "value", "count" ) ); 504 505 Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); 506 507 flow.complete(); 508 509 validateLength( flow, 1, null ); 510 } 511 512 /** 513 * Intentionally filters all values out to test next mr job behaves 514 * 515 * @throws Exception 516 */ 517 @Test 518 public void testFilterAll() throws Exception 519 { 520 getPlatform().copyFromLocal( inputFileApache ); 521 522 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); 523 Tap sink = getPlatform().getTextFile( getOutputPath( "filterall" ), SinkMode.REPLACE ); 524 525 Pipe pipe = new Pipe( "test" ); 526 527 String regex = "^([^ ]*) +[^ ]* +[^ ]* +\\[([^]]*)\\] +\\\"([^ ]*) ([^ ]*) [^ ]*\\\" ([^ ]*) ([^ ]*).*$"; 528 Fields fieldDeclaration = new Fields( "ip", "time", "method", "event", "status", "size" ); 529 int[] groups = {1, 2, 3, 4, 5, 6}; 530 RegexParser function = new RegexParser( fieldDeclaration, regex, groups ); 531 pipe = new Each( pipe, new Fields( "line" ), function ); 532 533 pipe = new Each( pipe, new Fields( "method" ), new RegexFilter( "^fobar" ) ); // intentionally filtering all 534 535 pipe = new GroupBy( pipe, new Fields( "method" ) ); 536 537 pipe = new Each( pipe, new Fields( "method" ), new Identity( new Fields( "value" ) ), Fields.ALL ); 538 539 pipe = new GroupBy( pipe, new Fields( "value" ) ); 540 541 pipe = new Every( pipe, new Count(), new Fields( "value", "count" ) ); 542 543 Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); 544 545 flow.complete(); 546 547 validateLength( flow, 0, null ); 548 } 549 550// public void testLimitFilter() throws Exception 551// { 552// copyFromLocal( inputFileApache ); 553// 554// Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache ); 555// Tap sink = new Lfs( new TextLine(), outputPath + "/limitfilter", true ); 556// 557// Pipe pipe = new Pipe( "test" ); 558// 559// Filter filter = new Limit( 7 ); 560// 561// pipe = new Each( pipe, new Fields( "line" ), filter ); 562// 563// Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe ); 564// 565//// flow.writeDOT( "flow.dot" ); 566// 567// flow.complete(); 568// 569// validateLength( flow, 7, null ); 570// } 571 572 // 573 574 /* 575 * 576 * TODO: create (optional) Tez rule to consolidate into a single DAG. currently renders to two DAGs, one for each side 577 * 578 */ 579 @Test 580 public void testSplit() throws Exception 581 { 582 getPlatform().copyFromLocal( inputFileApache ); 583 584 // 46 192 585 586 Tap source = getPlatform().getTextFile( inputFileApache ); 587 Tap sink1 = getPlatform().getTextFile( getOutputPath( "split1" ), SinkMode.REPLACE ); 588 Tap sink2 = getPlatform().getTextFile( getOutputPath( "split2" ), SinkMode.REPLACE ); 589 590 Pipe pipe = new Pipe( "split" ); 591 592 pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) ); 593 594 Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) ); 595 Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*102.*" ) ); 596 597 Map sources = new HashMap(); 598 sources.put( "split", source ); 599 600 Map sinks = new HashMap(); 601 sinks.put( "left", sink1 ); 602 sinks.put( "right", sink2 ); 603 604 Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, left, right ); 605 606 flow.complete(); 607 608 validateLength( flow, 1, "left" ); 609 validateLength( flow, 2, "right" ); 610 } 611 612 /** 613 * verifies non-safe rules apply in the proper place 614 * 615 * @throws Exception 616 */ 617 @Test 618 public void testSplitNonSafe() throws Exception 619 { 620 getPlatform().copyFromLocal( inputFileApache ); 621 622 // 46 192 623 624 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); 625 Tap sink1 = getPlatform().getTextFile( getOutputPath( "nonsafesplit1" ), SinkMode.REPLACE ); 626 Tap sink2 = getPlatform().getTextFile( getOutputPath( "nonsafesplit2" ), SinkMode.REPLACE ); 627 628 Pipe pipe = new Pipe( "split" ); 629 630 // run job on non-safe operation, forces 3 mr jobs. 631 pipe = new Each( pipe, new TestFunction( new Fields( "ignore" ), new Tuple( 1 ), false ), new Fields( "line" ) ); 632 633 pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) ); 634 635 Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) ); 636 Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*102.*" ) ); 637 638 Map sources = new HashMap(); 639 sources.put( "split", source ); 640 641 Map sinks = new HashMap(); 642 sinks.put( "left", sink1 ); 643 sinks.put( "right", sink2 ); 644 645 Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, left, right ); 646 647 flow.complete(); 648 649 validateLength( flow, 1, "left" ); 650 validateLength( flow, 2, "right" ); 651 } 652 653 @Test 654 public void testSplitSameSourceMerged() throws Exception 655 { 656 getPlatform().copyFromLocal( inputFileApache ); 657 658 // 46 192 659 660 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); 661 Tap sink = getPlatform().getTextFile( getOutputPath( "splitsourcemerged" ), SinkMode.REPLACE ); 662 663 Pipe pipe = new Pipe( "split" ); 664 665 pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) ); 666 667 Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) ); 668 Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*102.*" ) ); 669 670 Pipe merged = new GroupBy( "merged", Pipe.pipes( left, right ), new Fields( "line" ) ); 671 672 Flow flow = getPlatform().getFlowConnector().connect( source, sink, merged ); 673 674 flow.complete(); 675 676 validateLength( flow, 3 ); 677 } 678 679 /** 680 * verifies not inserting Identity between groups works 681 * 682 * @throws Exception 683 */ 684 @Test 685 public void testSplitOut() throws Exception 686 { 687 getPlatform().copyFromLocal( inputFileApache ); 688 689 Tap sourceLower = getPlatform().getTextFile( new Fields( "num", "line" ), inputFileApache ); 690 691 Map sources = new HashMap(); 692 693 sources.put( "lower1", sourceLower ); 694 695 // using null pos so all fields are written 696 Tap sink1 = getPlatform().getTextFile( getOutputPath( "splitout1" ), SinkMode.REPLACE ); 697 Tap sink2 = getPlatform().getTextFile( getOutputPath( "splitout2" ), SinkMode.REPLACE ); 698 699 Map sinks = new HashMap(); 700 701 sinks.put( "output1", sink1 ); 702 sinks.put( "output2", sink2 ); 703 704 Pipe pipeLower1 = new Pipe( "lower1" ); 705 706 Pipe left = new GroupBy( "output1", pipeLower1, new Fields( 0 ) ); 707 Pipe right = new GroupBy( "output2", left, new Fields( 0 ) ); 708 709 Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, Pipe.pipes( left, right ) ); 710 711// flow.writeDOT( "spit.dot" ); 712 713 flow.complete(); 714 715 validateLength( flow, 10, "output1" ); 716 validateLength( flow, 10, "output2" ); 717 718 assertEquals( 10, asSet( flow, sink1 ).size() ); 719 assertEquals( 10, asSet( flow, sink2 ).size() ); 720 } 721 722 @Test 723 public void testSplitComplex() throws Exception 724 { 725 getPlatform().copyFromLocal( inputFileApache ); 726 727 // 46 192 728 729 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); 730 Tap sink1 = getPlatform().getTextFile( getOutputPath( "splitcomp1" ), SinkMode.REPLACE ); 731 Tap sink2 = getPlatform().getTextFile( getOutputPath( "splitcomp2" ), SinkMode.REPLACE ); 732 733 Pipe pipe = new Pipe( "split" ); 734 735 pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); 736 737 pipe = new GroupBy( pipe, new Fields( "ip" ) ); 738 739 pipe = new Every( pipe, new Fields( "ip" ), new Count(), new Fields( "ip", "count" ) ); 740 741 pipe = new Each( pipe, new Fields( "ip" ), new RegexFilter( "^68.*" ) ); 742 743 Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "ip" ), new RegexFilter( ".*46.*" ) ); 744 745 Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "ip" ), new RegexFilter( ".*102.*" ) ); 746 747 Map sources = Cascades.tapsMap( "split", source ); 748 Map sinks = Cascades.tapsMap( Pipe.pipes( left, right ), Tap.taps( sink1, sink2 ) ); 749 750 Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, left, right ); 751 752 flow.complete(); 753 754 validateLength( flow, 1, "left" ); 755 validateLength( flow, 1, "right" ); 756 } 757 758 @Test 759 public void testSplitMultiple() throws Exception 760 { 761 getPlatform().copyFromLocal( inputFileApache ); 762 763 // 46 192 764 765 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); 766 Tap sinkLeft = getPlatform().getTextFile( getOutputPath( "left" ), SinkMode.REPLACE ); 767 Tap sinkRightLeft = getPlatform().getTextFile( getOutputPath( "rightleft" ), SinkMode.REPLACE ); 768 Tap sinkRightRight = getPlatform().getTextFile( getOutputPath( "rightright" ), SinkMode.REPLACE ); 769 770 Pipe head = new Pipe( "split" ); 771 772 head = new Each( head, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); 773 774 head = new GroupBy( head, new Fields( "ip" ) ); 775 776 head = new Every( head, new Fields( "ip" ), new Count(), new Fields( "ip", "count" ) ); 777 778 head = new Each( head, new Fields( "ip" ), new RegexFilter( "^68.*" ) ); 779 780 Pipe left = new Each( new Pipe( "left", head ), new Fields( "ip" ), new RegexFilter( ".*46.*" ) ); 781 782 Pipe right = new Each( new Pipe( "right", head ), new Fields( "ip" ), new RegexFilter( ".*102.*" ) ); 783 784 right = new GroupBy( right, new Fields( "ip" ) ); 785 786 Pipe rightLeft = new Each( new Pipe( "rightLeft", right ), new Fields( "ip" ), new Identity() ); 787 788 Pipe rightRight = new Each( new Pipe( "rightRight", right ), new Fields( "ip" ), new Identity() ); 789 790 Map sources = Cascades.tapsMap( "split", source ); 791 Map sinks = Cascades.tapsMap( Pipe.pipes( left, rightLeft, rightRight ), Tap.taps( sinkLeft, sinkRightLeft, sinkRightRight ) ); 792 793 Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, left, rightLeft, rightRight ); 794 795 flow.complete(); 796 797 validateLength( flow, 1, "left" ); 798 validateLength( flow, 1, "rightLeft" ); 799 validateLength( flow, 1, "rightRight" ); 800 } 801 802 @Test 803 public void testConcatenation() throws Exception 804 { 805 getPlatform().copyFromLocal( inputFileLower ); 806 getPlatform().copyFromLocal( inputFileUpper ); 807 808 Tap sourceLower = getPlatform().getTextFile( inputFileLower ); 809 Tap sourceUpper = getPlatform().getTextFile( inputFileUpper ); 810 811 Tap source = new MultiSourceTap( sourceLower, sourceUpper ); 812 813 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 814 815 // using null pos so all fields are written 816 Tap sink = getPlatform().getTextFile( getOutputPath( "complexconcat" ), SinkMode.REPLACE ); 817 818 Pipe pipe = new Each( new Pipe( "concat" ), new Fields( "line" ), splitter ); 819 820 Pipe splice = new GroupBy( pipe, new Fields( "num" ) ); 821 822 Flow countFlow = getPlatform().getFlowConnector().connect( source, sink, splice ); 823 824 countFlow.complete(); 825 826 validateLength( countFlow, 10, null ); 827 } 828 829 @Test 830 public void testGeneratorAggregator() throws Exception 831 { 832 getPlatform().copyFromLocal( inputFileApache ); 833 834 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); 835 836 Pipe pipe = new Pipe( "test" ); 837 838 pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) ); 839 840 pipe = new GroupBy( pipe, new Fields( "ip" ) ); 841 842 pipe = new Every( pipe, new TestAggregator( new Fields( "count1" ), new Fields( "ip" ), new Tuple( "first1" ), new Tuple( "first2" ) ) ); 843 pipe = new Every( pipe, new TestAggregator( new Fields( "count2" ), new Fields( "ip" ), new Tuple( "second" ), new Tuple( "second2" ), new Tuple( "second3" ) ) ); 844 845 Tap sink = getPlatform().getTextFile( getOutputPath( "generatoraggregator" ), SinkMode.REPLACE ); 846 847 Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); 848 849 flow.complete(); 850 851 validateLength( flow, 8 * 2 * 3, null ); 852 } 853 854 @Test 855 public void testReplace() throws Exception 856 { 857 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); 858 Tap sink = getPlatform().getTextFile( new Fields( "offset", "line" ), new Fields( "offset", "line" ), getOutputPath( "replace" ), SinkMode.REPLACE ); 859 860 Pipe pipe = new Pipe( "test" ); 861 862 Function parser = new RegexParser( new Fields( 0 ), "^[^ ]*" ); 863 pipe = new Each( pipe, new Fields( "line" ), parser, Fields.REPLACE ); 864 pipe = new Each( pipe, new Fields( "line" ), new Identity( Fields.ARGS ), Fields.REPLACE ); 865 pipe = new Each( pipe, new Fields( "line" ), new Identity( new Fields( "line" ) ), Fields.REPLACE ); 866 867 pipe = new Each( pipe, new Debug( true ) ); 868 869 Flow flow = getPlatform().getFlowConnector( disableDebug() ).connect( source, sink, pipe ); 870 871 flow.complete(); 872 873 validateLength( flow, 10, 2, Pattern.compile( "^\\d+\\s\\d+\\s[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}$" ) ); 874 } 875 876 @Test 877 public void testSwap() throws Exception 878 { 879 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); 880 Tap sink = getPlatform().getTextFile( new Fields( "offset", "line" ), new Fields( "count", "ipaddress" ), getOutputPath( "swap" ), SinkMode.REPLACE ); 881 882 Pipe pipe = new Pipe( "test" ); 883 884 Function parser = new RegexParser( new Fields( "ip" ), "^[^ ]*" ); 885 pipe = new Each( pipe, new Fields( "line" ), parser, Fields.SWAP ); 886 pipe = new GroupBy( pipe, new Fields( "ip" ) ); 887 pipe = new Every( pipe, new Fields( "ip" ), new Count( new Fields( "count" ) ) ); 888 pipe = new Each( pipe, new Fields( "ip" ), new Identity( new Fields( "ipaddress" ) ), Fields.SWAP ); 889 890 Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); 891 892 flow.complete(); 893 894 validateLength( flow, 8, 2, Pattern.compile( "^\\d+\\s\\d+\\s[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}$" ) ); 895 } 896 897 @Test 898 public void testNone() throws Exception 899 { 900 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); 901 Tap sink = getPlatform().getTextFile( new Fields( "offset", "line" ), new Fields( "count", "ip" ), getOutputPath( "none" ), SinkMode.REPLACE ); 902 903 Pipe pipe = new Pipe( "test" ); 904 905 Function parser = new RegexParser( new Fields( "ip" ), "^[^ ]*" ); 906 pipe = new Each( pipe, new Fields( "line" ), parser, Fields.ALL ); 907 pipe = new Each( pipe, new Fields( "line" ), new NoOp(), Fields.SWAP ); // declares Fields.NONE 908 pipe = new GroupBy( pipe, new Fields( "ip" ) ); 909 pipe = new Every( pipe, new Fields( "ip" ), new Count( new Fields( "count" ) ) ); 910 pipe = new Each( pipe, Fields.NONE, new Insert( new Fields( "ipaddress" ), "1.2.3.4" ), Fields.ALL ); 911 912 Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); 913 914 flow.complete(); 915 916 validateLength( flow, 8, 2, Pattern.compile( "^\\d+\\s\\d+\\s[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}$" ) ); 917 } 918 919 /** 920 * this tests a merge on two pipes with the same source and name. 921 * 922 * @throws Exception 923 */ 924 @Test 925 public void testSplitSameSourceMergedSameName() throws Exception 926 { 927 getPlatform().copyFromLocal( inputFileApache ); 928 929 // 46 192 930 931 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); 932 Tap sink = getPlatform().getTextFile( getOutputPath( "splitsourcemergedsamename" ), SinkMode.REPLACE ); 933 934 Pipe pipe = new Pipe( "split" ); 935 936 pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) ); 937 938 Pipe left = new Each( pipe, new Fields( "line" ), new RegexFilter( ".*46.*" ) ); 939 Pipe right = new Each( pipe, new Fields( "line" ), new RegexFilter( ".*102.*" ) ); 940 941 Pipe merged = new GroupBy( "merged", Pipe.pipes( left, right ), new Fields( "line" ) ); 942 943 Flow flow = getPlatform().getFlowConnector().connect( source, sink, merged ); 944 945 flow.complete(); 946 947 validateLength( flow, 3 ); 948 } 949 950 /** 951 * Catches failure to properly resolve the grouping fields as incoming to the second group-by 952 * 953 * @throws Exception 954 */ 955 @Test 956 public void testGroupGroup() throws Exception 957 { 958 getPlatform().copyFromLocal( inputFileApache ); 959 960 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); 961 962 Pipe pipe = new Pipe( "test" ); 963 964 pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip", String.class ), "^[^ ]*" ), new Fields( "ip" ) ); 965 966 pipe = new GroupBy( pipe, new Fields( "ip" ) ); 967 968 pipe = new Every( pipe, new Count(), new Fields( "ip", "count" ) ); 969 970 pipe = new GroupBy( pipe, new Fields( "ip" ), new Fields( "count" ) ); 971 972 Tap sink = getPlatform().getTextFile( getOutputPath( "groupgroup" ), SinkMode.REPLACE ); 973 974 Map<Object, Object> properties = getProperties(); 975 976 properties.put( "cascading.serialization.types.required", "true" ); 977 978 Flow flow = getPlatform().getFlowConnector( properties ).connect( source, sink, pipe ); 979 980 flow.complete(); 981 982 validateLength( flow, 8, null ); 983 } 984 985 public static class LowerComparator implements Comparator<Comparable>, Hasher<Comparable>, Serializable 986 { 987 @Override 988 public int compare( Comparable lhs, Comparable rhs ) 989 { 990 return lhs.toString().toLowerCase().compareTo( rhs.toString().toLowerCase() ); 991 } 992 993 @Override 994 public int hashCode( Comparable value ) 995 { 996 if( value == null ) 997 return 0; 998 999 return value.toString().toLowerCase().hashCode(); 1000 } 1001 } 1002 1003 @Test 1004 public void testGroupByInsensitive() throws Exception 1005 { 1006 getPlatform().copyFromLocal( inputFileLower ); 1007 getPlatform().copyFromLocal( inputFileUpper ); 1008 1009 Tap sourceLower = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower ); 1010 Tap sourceUpper = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileUpper ); 1011 1012 Map sources = new HashMap(); 1013 1014 sources.put( "lower", sourceLower ); 1015 sources.put( "upper", sourceUpper ); 1016 1017 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "insensitivegrouping" + NONDETERMINISTIC ), SinkMode.REPLACE ); 1018 1019 Pipe pipeLower = new Pipe( "lower" ); 1020 Pipe pipeUpper = new Pipe( "upper" ); 1021 1022 Pipe merge = new Merge( pipeLower, pipeUpper ); 1023 1024 Fields charFields = new Fields( "char" ); 1025 charFields.setComparator( "char", new LowerComparator() ); 1026 1027 Pipe splice = new GroupBy( "groupby", merge, charFields ); 1028 1029 splice = new Every( splice, new Fields( "char" ), new Count() ); 1030 1031 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 1032 1033 flow.complete(); 1034 1035 // we can't guarantee if the grouping key will be upper or lower 1036 validateLength( flow, 5, 1, Pattern.compile( "^\\w+\\s2$" ) ); 1037 } 1038 }