001/* 002 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved. 003 * 004 * Project and contact information: https://cascading.wensel.net/ 005 * 006 * This file is part of the Cascading project. 007 * 008 * Licensed under the Apache License, Version 2.0 (the "License"); 009 * you may not use this file except in compliance with the License. 010 * You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, software 015 * distributed under the License is distributed on an "AS IS" BASIS, 016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 017 * See the License for the specific language governing permissions and 018 * limitations under the License. 019 */ 020 021package cascading; 022 023import java.io.Serializable; 024import java.util.ArrayList; 025import java.util.Collections; 026import java.util.Comparator; 027import java.util.HashMap; 028import java.util.HashSet; 029import java.util.List; 030import java.util.Map; 031import java.util.Set; 032 033import cascading.flow.Flow; 034import cascading.flow.FlowDef; 035import cascading.flow.FlowStep; 036import cascading.flow.planner.graph.ElementGraph; 037import cascading.operation.Aggregator; 038import cascading.operation.Function; 039import cascading.operation.Identity; 040import cascading.operation.aggregator.Count; 041import cascading.operation.aggregator.First; 042import cascading.operation.expression.ExpressionFunction; 043import cascading.operation.regex.RegexFilter; 044import cascading.operation.regex.RegexSplitter; 045import cascading.pipe.Checkpoint; 046import cascading.pipe.CoGroup; 047import cascading.pipe.Each; 048import cascading.pipe.Every; 049import cascading.pipe.GroupBy; 050import cascading.pipe.HashJoin; 051import cascading.pipe.Merge; 052import cascading.pipe.Pipe; 053import cascading.pipe.assembly.Rename; 054import cascading.pipe.joiner.InnerJoin; 055import cascading.pipe.joiner.Joiner; 056import cascading.pipe.joiner.LeftJoin; 057import cascading.pipe.joiner.MixedJoin; 058import cascading.pipe.joiner.OuterJoin; 059import cascading.pipe.joiner.RightJoin; 060import cascading.tap.SinkMode; 061import cascading.tap.Tap; 062import cascading.tuple.Fields; 063import cascading.tuple.Hasher; 064import cascading.tuple.Tuple; 065import org.junit.Test; 066 067import static data.InputData.*; 068 069public class JoinFieldedPipesPlatformTest extends PlatformTestCase 070 { 071 public JoinFieldedPipesPlatformTest() 072 { 073 super( true, 4, 1 ); // leave cluster testing enabled 074 } 075 076 @Test 077 public void testCross() throws Exception 078 { 079 getPlatform().copyFromLocal( inputFileLhs ); 080 getPlatform().copyFromLocal( inputFileRhs ); 081 082 Map sources = new HashMap(); 083 084 sources.put( "lhs", getPlatform().getTextFile( inputFileLhs ) ); 085 sources.put( "rhs", getPlatform().getTextFile( inputFileRhs ) ); 086 087 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cross" ), SinkMode.REPLACE ); 088 089 Pipe pipeLower = new Each( "lhs", new Fields( "line" ), new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " ) ); 090 Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) ); 091 092 Pipe cross = new HashJoin( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() ); 093 094 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, cross ); 095 096 flow.complete(); 097 098 validateLength( flow, 37, null ); 099 100 List<Tuple> values = getSinkAsList( flow ); 101 102 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 103 assertTrue( values.contains( new Tuple( "1\ta\t1\tB" ) ) ); 104 } 105 106 @Test 107 public void testJoin() throws Exception 108 { 109 getPlatform().copyFromLocal( inputFileLower ); 110 getPlatform().copyFromLocal( inputFileUpper ); 111 112 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 113 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 114 115 Map sources = new HashMap(); 116 117 sources.put( "lower", sourceLower ); 118 sources.put( "upper", sourceUpper ); 119 120 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "join" ), SinkMode.REPLACE ); 121 122 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 123 124 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 125 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 126 127 Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) ); 128 129 Map<Object, Object> properties = getProperties(); 130 131 Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice ); 132 133 flow.complete(); 134 135 validateLength( flow, 5 ); 136 137 List<Tuple> values = getSinkAsList( flow ); 138 139 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 140 assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); 141 } 142 143 @Test 144 public void testJoinSamePipeName() throws Exception 145 { 146 getPlatform().copyFromLocal( inputFileLower ); 147 getPlatform().copyFromLocal( inputFileUpper ); 148 149 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 150 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 151 152 Map sources = new HashMap(); 153 154 sources.put( "lower", sourceLower ); 155 sources.put( "upper", sourceUpper ); 156 157 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "renamedpipes" ), SinkMode.REPLACE ); 158 159 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 160 161 Pipe pipeLower = new Pipe( "lower" ); 162 Pipe pipeUpper = new Pipe( "upper" ); 163 164 // these pipes will hide the source name, and could cause one to be lost 165 pipeLower = new Pipe( "same", pipeLower ); 166 pipeUpper = new Pipe( "same", pipeUpper ); 167 168 pipeLower = new Each( pipeLower, new Fields( "line" ), splitter ); 169 pipeUpper = new Each( pipeUpper, new Fields( "line" ), splitter ); 170 171// pipeLower = new Each( pipeLower, new Fields( "num", "char" ), new Identity( new Fields( "num", "char" ) ) ); 172// pipeUpper = new Each( pipeUpper, new Fields( "num", "char" ), new Identity( new Fields( "num", "char" ) ) ); 173 174 pipeLower = new Pipe( "left", pipeLower ); 175 pipeUpper = new Pipe( "right", pipeUpper ); 176 177// pipeLower = new Each( pipeLower, new Debug( true ) ); 178// pipeUpper = new Each( pipeUpper, new Debug( true ) ); 179 180 Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) ); 181 182// splice = new Each( splice, new Debug( true ) ); 183 splice = new Pipe( "splice", splice ); 184 splice = new Pipe( "tail", splice ); 185 186 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 187 188 flow.complete(); 189 190 validateLength( flow, 5 ); 191 192 List<Tuple> values = getSinkAsList( flow ); 193 194 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 195 assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); 196 } 197 198 @Test 199 public void testJoinWithUnknowns() throws Exception 200 { 201 getPlatform().copyFromLocal( inputFileLower ); 202 getPlatform().copyFromLocal( inputFileUpper ); 203 204 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 205 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 206 207 Map sources = new HashMap(); 208 209 sources.put( "lower", sourceLower ); 210 sources.put( "upper", sourceUpper ); 211 212 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "unknown" ), SinkMode.REPLACE ); 213 214 Function splitter = new RegexSplitter( Fields.UNKNOWN, " " ); 215 216 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 217 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 218 219 Pipe splice = new HashJoin( pipeLower, new Fields( 0 ), pipeUpper, new Fields( 0 ), Fields.size( 4 ) ); 220 221 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 222 223 flow.complete(); 224 225 validateLength( flow, 5 ); 226 227 List<Tuple> values = getSinkAsList( flow ); 228 229 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 230 assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); 231 } 232 233 /** 234 * this test intentionally filters out all values so the intermediate tap is empty. this tap is cogrouped with 235 * a new stream using an outerjoin. 236 * 237 * @throws Exception 238 */ 239 @Test 240 public void testJoinFilteredBranch() throws Exception 241 { 242 getPlatform().copyFromLocal( inputFileLower ); 243 getPlatform().copyFromLocal( inputFileUpper ); 244 245 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 246 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 247 248 Map sources = new HashMap(); 249 250 sources.put( "lower", sourceLower ); 251 sources.put( "upper", sourceUpper ); 252 253 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinfilteredbranch" ), SinkMode.REPLACE ); 254 255 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 256 257 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 258 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 259 pipeUpper = new Each( pipeUpper, new Fields( "num" ), new RegexFilter( "^fobar" ) ); // intentionally filtering all 260 pipeUpper = new GroupBy( pipeUpper, new Fields( "num" ) ); 261 262 Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ), new OuterJoin() ); 263 264 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 265 266 flow.complete(); 267 268 validateLength( flow, 5 ); 269 270 List<Tuple> values = getSinkAsList( flow ); 271 272 assertTrue( values.contains( new Tuple( "1\ta\tnull\tnull" ) ) ); 273 assertTrue( values.contains( new Tuple( "2\tb\tnull\tnull" ) ) ); 274 } 275 276 @Test 277 public void testJoinSelf() throws Exception 278 { 279 getPlatform().copyFromLocal( inputFileLhs ); 280 281 Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs ); 282 Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs ); 283 284 Map sources = new HashMap(); 285 286 sources.put( "lhs", sourceLhs ); 287 sources.put( "rhs", sourceRhs ); 288 289 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinself" ), SinkMode.REPLACE ); 290 291 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 292 293 Pipe pipeLower = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter ); 294 Pipe pipeUpper = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter ); 295 296 Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) ); 297 298 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 299 300 flow.complete(); 301 302 validateLength( flow, 37 ); 303 304 List<Tuple> values = getSinkAsList( flow ); 305 306 assertTrue( values.contains( new Tuple( "1\ta\t1\ta" ) ) ); 307 assertTrue( values.contains( new Tuple( "2\tb\t2\tb" ) ) ); 308 } 309 310 @Test 311 public void testSameSourceJoin() throws Exception 312 { 313 getPlatform().copyFromLocal( inputFileLhs ); 314 315 Tap source = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLhs ); 316 317 Map sources = new HashMap(); 318 319 sources.put( "lhs", source ); 320 sources.put( "rhs", source ); 321 322 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath(), SinkMode.REPLACE ); 323 324 Pipe pipeLower = new Pipe( "lhs" ); 325 Pipe pipeUpper = new Pipe( "rhs" ); 326 327 Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) ); 328 329 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 330 331 flow.complete(); 332 333 validateLength( flow, 37 ); 334 335 List<Tuple> values = getSinkAsList( flow ); 336 337 assertTrue( values.contains( new Tuple( "1\ta\t1\ta" ) ) ); 338 assertTrue( values.contains( new Tuple( "2\tb\t2\tb" ) ) ); 339 } 340 341 /** 342 * Method testCoGroupAfterEvery tests that a tmp tap is inserted after the Every in the cogroup join 343 * 344 * @throws Exception when 345 */ 346 @Test 347 public void testJoinAfterEvery() throws Exception 348 { 349 getPlatform().copyFromLocal( inputFileLower ); 350 getPlatform().copyFromLocal( inputFileUpper ); 351 352 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 353 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 354 355 Map sources = new HashMap(); 356 357 sources.put( "lower", sourceLower ); 358 sources.put( "upper", sourceUpper ); 359 360 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "afterevery" ), SinkMode.REPLACE ); 361 362 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 363 364 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 365 pipeLower = new GroupBy( pipeLower, new Fields( "num" ) ); 366 pipeLower = new Every( pipeLower, new Fields( "char" ), new First(), Fields.ALL ); 367 368 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 369 pipeUpper = new GroupBy( pipeUpper, new Fields( "num" ) ); 370 pipeUpper = new Every( pipeUpper, new Fields( "char" ), new First(), Fields.ALL ); 371 372 Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) ); 373 374 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 375 376 flow.complete(); 377 378 validateLength( flow, 5, null ); 379 380 List<Tuple> values = getSinkAsList( flow ); 381 382 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 383 assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); 384 } 385 386 @Test 387 public void testJoinInnerSingleField() throws Exception 388 { 389 getPlatform().copyFromLocal( inputFileLowerOffset ); 390 getPlatform().copyFromLocal( inputFileUpper ); 391 392 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLowerOffset ); 393 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 394 395 Map sources = new HashMap(); 396 397 sources.put( "lower", sourceLower ); 398 sources.put( "upper", sourceUpper ); 399 400 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joininnersingle" ), SinkMode.REPLACE ); 401 402 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char" ), " " ), new Fields( "num1" ) ); 403 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), new RegexSplitter( new Fields( "num2", "char" ), " " ), new Fields( "num2" ) ); 404 405 Pipe join = new HashJoin( pipeLower, new Fields( "num1" ), pipeUpper, new Fields( "num2" ) ); 406 407 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, join ); 408 409 flow.complete(); 410 411 validateLength( flow, 3, null ); 412 413 Set<Tuple> results = new HashSet<Tuple>(); 414 415 results.add( new Tuple( "1\t1" ) ); 416 results.add( new Tuple( "5\t5" ) ); 417 418 List<Tuple> actual = getSinkAsList( flow ); 419 420 results.removeAll( actual ); 421 422 assertEquals( 0, results.size() ); 423 } 424 425 /** 426 * 1 a1 427 * 1 a2 428 * 1 a3 429 * 2 b1 430 * 3 c1 431 * 4 d1 432 * 4 d2 433 * 4 d3 434 * 5 e1 435 * 5 e2 436 * 5 e3 437 * 7 g1 438 * 7 g2 439 * 7 g3 440 * 7 g4 441 * 7 g5 442 * null h1 443 * <p> 444 * 1 A1 445 * 1 A2 446 * 1 A3 447 * 2 B1 448 * 2 B2 449 * 2 B3 450 * 4 D1 451 * 6 F1 452 * 6 F2 453 * null H1 454 * <p> 455 * 1 a1 1 A1 456 * 1 a1 1 A2 457 * 1 a1 1 A3 458 * 1 a2 1 A1 459 * 1 a2 1 A2 460 * 1 a2 1 A3 461 * 1 a3 1 A1 462 * 1 a3 1 A2 463 * 1 a3 1 A3 464 * 2 b1 2 B1 465 * 2 b1 2 B2 466 * 2 b1 2 B3 467 * 4 d1 4 D1 468 * 4 d2 4 D1 469 * 4 d3 4 D1 470 * null h1 null H1 471 * 472 * @throws Exception 473 */ 474 @Test 475 public void testJoinInner() throws Exception 476 { 477 HashSet<Tuple> results = new HashSet<Tuple>(); 478 479 results.add( new Tuple( "1", "a1", "1", "A1" ) ); 480 results.add( new Tuple( "1", "a1", "1", "A2" ) ); 481 results.add( new Tuple( "1", "a1", "1", "A3" ) ); 482 results.add( new Tuple( "1", "a2", "1", "A1" ) ); 483 results.add( new Tuple( "1", "a2", "1", "A2" ) ); 484 results.add( new Tuple( "1", "a2", "1", "A3" ) ); 485 results.add( new Tuple( "1", "a3", "1", "A1" ) ); 486 results.add( new Tuple( "1", "a3", "1", "A2" ) ); 487 results.add( new Tuple( "1", "a3", "1", "A3" ) ); 488 results.add( new Tuple( "2", "b1", "2", "B1" ) ); 489 results.add( new Tuple( "2", "b1", "2", "B2" ) ); 490 results.add( new Tuple( "2", "b1", "2", "B3" ) ); 491 results.add( new Tuple( "4", "d1", "4", "D1" ) ); 492 results.add( new Tuple( "4", "d2", "4", "D1" ) ); 493 results.add( new Tuple( "4", "d3", "4", "D1" ) ); 494 results.add( new Tuple( null, "h1", null, "H1" ) ); 495 496 handleJoins( "joininner", new InnerJoin(), results ); 497 } 498 499 /** 500 * /** 501 * 1 a1 502 * 1 a2 503 * 1 a3 504 * 2 b1 505 * 3 c1 506 * 4 d1 507 * 4 d2 508 * 4 d3 509 * 5 e1 510 * 5 e2 511 * 5 e3 512 * 7 g1 513 * 7 g2 514 * 7 g3 515 * 7 g4 516 * 7 g5 517 * null h1 518 * <p> 519 * 1 A1 520 * 1 A2 521 * 1 A3 522 * 2 B1 523 * 2 B2 524 * 2 B3 525 * 4 D1 526 * 6 F1 527 * 6 F2 528 * null H1 529 * <p> 530 * 1 a1 1 A1 531 * 1 a1 1 A2 532 * 1 a1 1 A3 533 * 1 a2 1 A1 534 * 1 a2 1 A2 535 * 1 a2 1 A3 536 * 1 a3 1 A1 537 * 1 a3 1 A2 538 * 1 a3 1 A3 539 * 2 b1 2 B1 540 * 2 b1 2 B2 541 * 2 b1 2 B3 542 * 3 c1 null null 543 * 4 d1 4 D1 544 * 4 d2 4 D1 545 * 4 d3 4 D1 546 * 5 e1 null null 547 * 5 e2 null null 548 * 5 e3 null null 549 * null null 6 F1 550 * null null 6 F2 551 * 7 g1 null null 552 * 7 g2 null null 553 * 7 g3 null null 554 * 7 g4 null null 555 * 7 g5 null null 556 * null h1 null H1 557 * 558 * @throws Exception 559 */ 560 @Test 561 public void testJoinOuter() throws Exception 562 { 563 // skip if hadoop cluster mode, outer joins don't behave the same 564 if( getPlatform().isMapReduce() && getPlatform().isUseCluster() ) 565 return; 566 567 Set<Tuple> results = new HashSet<Tuple>(); 568 569 results.add( new Tuple( "1", "a1", "1", "A1" ) ); 570 results.add( new Tuple( "1", "a1", "1", "A2" ) ); 571 results.add( new Tuple( "1", "a1", "1", "A3" ) ); 572 results.add( new Tuple( "1", "a2", "1", "A1" ) ); 573 results.add( new Tuple( "1", "a2", "1", "A2" ) ); 574 results.add( new Tuple( "1", "a2", "1", "A3" ) ); 575 results.add( new Tuple( "1", "a3", "1", "A1" ) ); 576 results.add( new Tuple( "1", "a3", "1", "A2" ) ); 577 results.add( new Tuple( "1", "a3", "1", "A3" ) ); 578 results.add( new Tuple( "2", "b1", "2", "B1" ) ); 579 results.add( new Tuple( "2", "b1", "2", "B2" ) ); 580 results.add( new Tuple( "2", "b1", "2", "B3" ) ); 581 results.add( new Tuple( "3", "c1", null, null ) ); 582 results.add( new Tuple( "4", "d1", "4", "D1" ) ); 583 results.add( new Tuple( "4", "d2", "4", "D1" ) ); 584 results.add( new Tuple( "4", "d3", "4", "D1" ) ); 585 results.add( new Tuple( "5", "e1", null, null ) ); 586 results.add( new Tuple( "5", "e2", null, null ) ); 587 results.add( new Tuple( "5", "e3", null, null ) ); 588 results.add( new Tuple( null, null, "6", "F1" ) ); 589 results.add( new Tuple( null, null, "6", "F2" ) ); 590 results.add( new Tuple( "7", "g1", null, null ) ); 591 results.add( new Tuple( "7", "g2", null, null ) ); 592 results.add( new Tuple( "7", "g3", null, null ) ); 593 results.add( new Tuple( "7", "g4", null, null ) ); 594 results.add( new Tuple( "7", "g5", null, null ) ); 595 results.add( new Tuple( null, "h1", null, "H1" ) ); 596 597 handleJoins( "joinouter", new OuterJoin(), results ); 598 } 599 600 /** 601 * 1 a1 602 * 1 a2 603 * 1 a3 604 * 2 b1 605 * 3 c1 606 * 4 d1 607 * 4 d2 608 * 4 d3 609 * 5 e1 610 * 5 e2 611 * 5 e3 612 * 7 g1 613 * 7 g2 614 * 7 g3 615 * 7 g4 616 * 7 g5 617 * null h1 618 * <p> 619 * 1 A1 620 * 1 A2 621 * 1 A3 622 * 2 B1 623 * 2 B2 624 * 2 B3 625 * 4 D1 626 * 6 F1 627 * 6 F2 628 * null H1 629 * <p> 630 * 1 a1 1 A1 631 * 1 a1 1 A2 632 * 1 a1 1 A3 633 * 1 a2 1 A1 634 * 1 a2 1 A2 635 * 1 a2 1 A3 636 * 1 a3 1 A1 637 * 1 a3 1 A2 638 * 1 a3 1 A3 639 * 2 b1 2 B1 640 * 2 b1 2 B2 641 * 2 b1 2 B3 642 * 3 c1 null null 643 * 4 d1 4 D1 644 * 4 d2 4 D1 645 * 4 d3 4 D1 646 * 5 e1 null null 647 * 5 e2 null null 648 * 5 e3 null null 649 * 7 g1 null null 650 * 7 g2 null null 651 * 7 g3 null null 652 * 7 g4 null null 653 * 7 g5 null null 654 * null h1 null H1 655 * 656 * @throws Exception 657 */ 658 @Test 659 public void testJoinInnerOuter() throws Exception 660 { 661 Set<Tuple> results = new HashSet<Tuple>(); 662 663 results.add( new Tuple( "1", "a1", "1", "A1" ) ); 664 results.add( new Tuple( "1", "a1", "1", "A2" ) ); 665 results.add( new Tuple( "1", "a1", "1", "A3" ) ); 666 results.add( new Tuple( "1", "a2", "1", "A1" ) ); 667 results.add( new Tuple( "1", "a2", "1", "A2" ) ); 668 results.add( new Tuple( "1", "a2", "1", "A3" ) ); 669 results.add( new Tuple( "1", "a3", "1", "A1" ) ); 670 results.add( new Tuple( "1", "a3", "1", "A2" ) ); 671 results.add( new Tuple( "1", "a3", "1", "A3" ) ); 672 results.add( new Tuple( "2", "b1", "2", "B1" ) ); 673 results.add( new Tuple( "2", "b1", "2", "B2" ) ); 674 results.add( new Tuple( "2", "b1", "2", "B3" ) ); 675 results.add( new Tuple( "3", "c1", null, null ) ); 676 results.add( new Tuple( "4", "d1", "4", "D1" ) ); 677 results.add( new Tuple( "4", "d2", "4", "D1" ) ); 678 results.add( new Tuple( "4", "d3", "4", "D1" ) ); 679 results.add( new Tuple( "5", "e1", null, null ) ); 680 results.add( new Tuple( "5", "e2", null, null ) ); 681 results.add( new Tuple( "5", "e3", null, null ) ); 682 results.add( new Tuple( "7", "g1", null, null ) ); 683 results.add( new Tuple( "7", "g2", null, null ) ); 684 results.add( new Tuple( "7", "g3", null, null ) ); 685 results.add( new Tuple( "7", "g4", null, null ) ); 686 results.add( new Tuple( "7", "g5", null, null ) ); 687 results.add( new Tuple( null, "h1", null, "H1" ) ); 688 689 handleJoins( "joininnerouter", new LeftJoin(), results ); 690 } 691 692 /** 693 * 1 a1 694 * 1 a2 695 * 1 a3 696 * 2 b1 697 * 3 c1 698 * 4 d1 699 * 4 d2 700 * 4 d3 701 * 5 e1 702 * 5 e2 703 * 5 e3 704 * 7 g1 705 * 7 g2 706 * 7 g3 707 * 7 g4 708 * 7 g5 709 * null h1 710 * <p> 711 * 1 A1 712 * 1 A2 713 * 1 A3 714 * 2 B1 715 * 2 B2 716 * 2 B3 717 * 4 D1 718 * 6 F1 719 * 6 F2 720 * null H1 721 * <p> 722 * 1 a1 1 A1 723 * 1 a1 1 A2 724 * 1 a1 1 A3 725 * 1 a2 1 A1 726 * 1 a2 1 A2 727 * 1 a2 1 A3 728 * 1 a3 1 A1 729 * 1 a3 1 A2 730 * 1 a3 1 A3 731 * 2 b1 2 B1 732 * 2 b1 2 B2 733 * 2 b1 2 B3 734 * 4 d1 4 D1 735 * 4 d2 4 D1 736 * 4 d3 4 D1 737 * null null 6 F1 738 * null null 6 F2 739 * null h1 null H1 740 * 741 * @throws Exception 742 */ 743 @Test 744 public void testJoinOuterInner() throws Exception 745 { 746 // skip if hadoop cluster mode, outer joins don't behave the same 747 if( getPlatform().isMapReduce() && getPlatform().isUseCluster() ) 748 return; 749 750 Set<Tuple> results = new HashSet<Tuple>(); 751 752 results.add( new Tuple( "1", "a1", "1", "A1" ) ); 753 results.add( new Tuple( "1", "a1", "1", "A2" ) ); 754 results.add( new Tuple( "1", "a1", "1", "A3" ) ); 755 results.add( new Tuple( "1", "a2", "1", "A1" ) ); 756 results.add( new Tuple( "1", "a2", "1", "A2" ) ); 757 results.add( new Tuple( "1", "a2", "1", "A3" ) ); 758 results.add( new Tuple( "1", "a3", "1", "A1" ) ); 759 results.add( new Tuple( "1", "a3", "1", "A2" ) ); 760 results.add( new Tuple( "1", "a3", "1", "A3" ) ); 761 results.add( new Tuple( "2", "b1", "2", "B1" ) ); 762 results.add( new Tuple( "2", "b1", "2", "B2" ) ); 763 results.add( new Tuple( "2", "b1", "2", "B3" ) ); 764 results.add( new Tuple( "4", "d1", "4", "D1" ) ); 765 results.add( new Tuple( "4", "d2", "4", "D1" ) ); 766 results.add( new Tuple( "4", "d3", "4", "D1" ) ); 767 results.add( new Tuple( null, null, "6", "F1" ) ); 768 results.add( new Tuple( null, null, "6", "F2" ) ); 769 results.add( new Tuple( null, "h1", null, "H1" ) ); 770 771 handleJoins( "joinouterinner", new RightJoin(), results ); 772 } 773 774 private void handleJoins( String path, Joiner joiner, Set<Tuple> results ) throws Exception 775 { 776 getPlatform().copyFromLocal( inputFileLhsSparse ); 777 getPlatform().copyFromLocal( inputFileRhsSparse ); 778 779 Fields fields = new Fields( "num", "char" ).applyTypes( Integer.class, String.class ); 780 Tap sourceLower = getPlatform().getDelimitedFile( fields, " ", inputFileLhsSparse ); 781 Tap sourceUpper = getPlatform().getDelimitedFile( fields, " ", inputFileRhsSparse ); 782 783 Map sources = new HashMap(); 784 785 sources.put( "lower", sourceLower ); 786 sources.put( "upper", sourceUpper ); 787 788 Tap sink = getPlatform().getDelimitedFile( Fields.size( 4, String.class ), "\t", getOutputPath( path ), SinkMode.REPLACE ); 789 790 Pipe pipeLower = new Pipe( "lower" ); 791 Pipe pipeUpper = new Pipe( "upper" ); 792 793 Fields declaredFields = new Fields( "num", "char", "num2", "char2" ); 794 Fields groupingFields = new Fields( "num" ); 795 796 Pipe splice = new HashJoin( pipeLower, groupingFields, pipeUpper, groupingFields, declaredFields, joiner ); 797 798 splice = new Each( splice, Fields.ALL, new Identity(), Fields.RESULTS ); 799 800 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 801 802 flow.complete(); 803 804 validateLength( flow, results.size() ); 805 806 List<Tuple> actual = getSinkAsList( flow ); 807 808 results.removeAll( actual ); 809 810 assertEquals( 0, results.size() ); 811 } 812 813 /** 814 * 1 a 815 * 5 b 816 * 6 c 817 * 5 b 818 * 5 e 819 * <p> 820 * 1 A 821 * 2 B 822 * 3 C 823 * 4 D 824 * 5 E 825 * <p> 826 * 1 a 827 * 2 b 828 * 3 c 829 * 4 d 830 * 5 e 831 * <p> 832 * 1 a 1 A 1 a 833 * - - 2 B 2 b 834 * - - 3 C 3 c 835 * - - 4 D 4 d 836 * 5 b 5 E 5 e 837 * 5 e 5 E 5 e 838 * 839 * @throws Exception 840 */ 841 @Test 842 public void testJoinMixed() throws Exception 843 { 844 // skip if hadoop cluster mode, outer joins don't behave the same 845 if( getPlatform().isMapReduce() && getPlatform().isUseCluster() ) 846 return; 847 848 getPlatform().copyFromLocal( inputFileLowerOffset ); 849 getPlatform().copyFromLocal( inputFileLower ); 850 getPlatform().copyFromLocal( inputFileUpper ); 851 852 Tap sourceLowerOffset = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLowerOffset ); 853 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 854 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 855 856 Map sources = new HashMap(); 857 858 sources.put( "loweroffset", sourceLowerOffset ); 859 sources.put( "lower", sourceLower ); 860 sources.put( "upper", sourceUpper ); 861 862 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinmixed" ), SinkMode.REPLACE ); 863 864 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 865 866 Pipe pipeLowerOffset = new Each( new Pipe( "loweroffset" ), new Fields( "line" ), splitter ); 867 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 868 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 869 870 Pipe[] pipes = Pipe.pipes( pipeLowerOffset, pipeUpper, pipeLower ); 871 Fields[] fields = Fields.fields( new Fields( "num" ), new Fields( "num" ), new Fields( "num" ) ); 872 873 MixedJoin join = new MixedJoin( new boolean[]{MixedJoin.OUTER, MixedJoin.INNER, MixedJoin.OUTER} ); 874 Pipe splice = new HashJoin( pipes, fields, Fields.size( 6 ), join ); 875 876 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice ); 877 878 flow.complete(); 879 880 validateLength( flow, 6 ); 881 882 Set<Tuple> results = new HashSet<Tuple>(); 883 884 results.add( new Tuple( "1\ta\t1\tA\t1\ta" ) ); 885 results.add( new Tuple( "null\tnull\t2\tB\t2\tb" ) ); 886 results.add( new Tuple( "null\tnull\t3\tC\t3\tc" ) ); 887 results.add( new Tuple( "null\tnull\t4\tD\t4\td" ) ); 888 results.add( new Tuple( "5\tb\t5\tE\t5\te" ) ); 889 results.add( new Tuple( "5\te\t5\tE\t5\te" ) ); 890 891 List<Tuple> actual = getSinkAsList( flow ); 892 893 results.removeAll( actual ); 894 895 assertEquals( 0, results.size() ); 896 } 897 898 @Test 899 public void testJoinDiffFields() throws Exception 900 { 901 getPlatform().copyFromLocal( inputFileLower ); 902 getPlatform().copyFromLocal( inputFileUpper ); 903 904 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 905 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 906 907 Map sources = new HashMap(); 908 909 sources.put( "lower", sourceLower ); 910 sources.put( "upper", sourceUpper ); 911 912 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "difffields" ), SinkMode.REPLACE ); 913 914 Function splitterLower = new RegexSplitter( new Fields( "numA", "lower" ), " " ); 915 Function splitterUpper = new RegexSplitter( new Fields( "numB", "upper" ), " " ); 916 917 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitterLower ); 918 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitterUpper ); 919 920 Pipe pipe = new HashJoin( pipeLower, new Fields( "numA" ), pipeUpper, new Fields( "numB" ) ); 921 922 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, pipe ); 923 924 flow.complete(); 925 926 validateLength( flow, 5 ); 927 928 List<Tuple> actual = getSinkAsList( flow ); 929 930 assertTrue( actual.contains( new Tuple( "1\ta\t1\tA" ) ) ); 931 assertTrue( actual.contains( new Tuple( "2\tb\t2\tB" ) ) ); 932 } 933 934 @Test 935 public void testJoinGroupBy() throws Exception 936 { 937 getPlatform().copyFromLocal( inputFileLower ); 938 getPlatform().copyFromLocal( inputFileUpper ); 939 940 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 941 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 942 943 Map sources = new HashMap(); 944 945 sources.put( "lower", sourceLower ); 946 sources.put( "upper", sourceUpper ); 947 948 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joingroupby" ), SinkMode.REPLACE ); 949 950 Function splitterLower = new RegexSplitter( new Fields( "numA", "lower" ), " " ); 951 Function splitterUpper = new RegexSplitter( new Fields( "numB", "upper" ), " " ); 952 953 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitterLower ); 954 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitterUpper ); 955 956 Pipe pipe = new HashJoin( pipeLower, new Fields( "numA" ), pipeUpper, new Fields( "numB" ) ); 957 958 Pipe groupby = new GroupBy( pipe, new Fields( "numA" ) ); 959 960 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, groupby ); 961 962 flow.complete(); 963 964 validateLength( flow, 5, null ); 965 966 List<Tuple> actual = getSinkAsList( flow ); 967 968 assertTrue( actual.contains( new Tuple( "1\ta\t1\tA" ) ) ); 969 assertTrue( actual.contains( new Tuple( "2\tb\t2\tB" ) ) ); 970 } 971 972 @Test 973 public void testJoinSamePipe() throws Exception 974 { 975 getPlatform().copyFromLocal( inputFileLower ); 976 977 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 978 979 Map sources = new HashMap(); 980 981 sources.put( "lower", source ); 982 983 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipe" ), SinkMode.REPLACE ); 984 985 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 986 987 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 988 989 Pipe pipe = new HashJoin( pipeLower, new Fields( "num" ), 1, new Fields( "num1", "char1", "num2", "char2" ) ); 990 991 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, pipe ); 992 993 flow.complete(); 994 995 validateLength( flow, 5, null ); 996 997 List<Tuple> actual = getSinkAsList( flow ); 998 999 assertTrue( actual.contains( new Tuple( "1\ta\t1\ta" ) ) ); 1000 assertTrue( actual.contains( new Tuple( "2\tb\t2\tb" ) ) ); 1001 } 1002 1003 @Test 1004 public void testJoinSamePipe2() throws Exception 1005 { 1006 getPlatform().copyFromLocal( inputFileLower ); 1007 1008 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1009 1010 Map sources = new HashMap(); 1011 1012 sources.put( "lower", source ); 1013 1014 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipe2" ), SinkMode.REPLACE ); 1015 1016 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1017 1018 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1019 1020 Pipe join = new HashJoin( pipeLower, new Fields( "num" ), pipeLower, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1021 1022 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, join ); 1023 1024 flow.complete(); 1025 1026 validateLength( flow, 5, null ); 1027 1028 List<Tuple> actual = getSinkAsList( flow ); 1029 1030 assertTrue( actual.contains( new Tuple( "1\ta\t1\ta" ) ) ); 1031 assertTrue( actual.contains( new Tuple( "2\tb\t2\tb" ) ) ); 1032 } 1033 1034 @Test 1035 public void testJoinSamePipe3() throws Exception 1036 { 1037 getPlatform().copyFromLocal( inputFileLower ); 1038 1039 Tap source = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower ); 1040 1041 Map sources = new HashMap(); 1042 1043 sources.put( "lower", source ); 1044 1045 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipe3" ), SinkMode.REPLACE ); 1046 1047 Pipe pipe = new Pipe( "lower" ); 1048 1049 Pipe lhs = new Pipe( "lhs", pipe ); 1050 Pipe rhs = new Pipe( "rhs", pipe ); 1051 1052 Pipe join = new HashJoin( lhs, new Fields( "num" ), rhs, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1053 1054 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, join ); 1055 1056 flow.complete(); 1057 1058 validateLength( flow, 5, null ); 1059 1060 List<Tuple> actual = getSinkAsList( flow ); 1061 1062 assertTrue( actual.contains( new Tuple( "1\ta\t1\ta" ) ) ); 1063 assertTrue( actual.contains( new Tuple( "2\tb\t2\tb" ) ) ); 1064 } 1065 1066 /** 1067 * Same source as rightmost 1068 * <p> 1069 * should be a single job as the same file accumulates into the joins 1070 * 1071 * @throws Exception 1072 */ 1073 @Test 1074 public void testJoinAroundJoinRightMost() throws Exception 1075 { 1076 getPlatform().copyFromLocal( inputFileLower ); 1077 getPlatform().copyFromLocal( inputFileUpper ); 1078 1079 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1080 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1081 1082 Map sources = new HashMap(); 1083 1084 sources.put( "lower", sourceLower ); 1085 sources.put( "upper1", sourceUpper ); 1086 sources.put( "upper2", sourceUpper ); 1087 1088 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinaroundjoinrightmost" ), SinkMode.REPLACE ); 1089 1090 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1091 1092 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1093 Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter ); 1094 Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter ); 1095 1096 Pipe splice1 = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper1, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1097 1098 splice1 = new Each( splice1, new Identity() ); 1099 1100 Pipe splice2 = new HashJoin( splice1, new Fields( "num1" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) ); 1101 1102 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 ); 1103 1104// flow.writeDOT( "joinaroundrightmost.dot" ); 1105 1106 if( getPlatform().isMapReduce() ) 1107 assertEquals( "wrong number of steps", 1, flow.getFlowSteps().size() ); 1108 1109 flow.complete(); 1110 1111 validateLength( flow, 5, null ); 1112 1113 List<Tuple> actual = getSinkAsList( flow ); 1114 1115 assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\tA" ) ) ); 1116 assertTrue( actual.contains( new Tuple( "2\tb\t2\tB\t2\tB" ) ) ); 1117 } 1118 1119 /** 1120 * Same source as leftmost 1121 * 1122 * @throws Exception 1123 */ 1124 @Test 1125 public void testJoinAroundJoinLeftMost() throws Exception 1126 { 1127 getPlatform().copyFromLocal( inputFileLower ); 1128 getPlatform().copyFromLocal( inputFileUpper ); 1129 1130 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1131 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1132 1133 Map sources = new HashMap(); 1134 1135 sources.put( "lower", sourceLower ); 1136 sources.put( "upper1", sourceUpper ); 1137 sources.put( "upper2", sourceUpper ); 1138 1139 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinaroundjoinleftmost" ), SinkMode.REPLACE ); 1140 1141 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1142 1143 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1144 Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter ); 1145 Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter ); 1146 1147 Pipe splice1 = new HashJoin( pipeUpper1, new Fields( "num" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1148 1149 splice1 = new Each( splice1, new Identity() ); 1150 1151 Pipe splice2 = new HashJoin( splice1, new Fields( "num1" ), pipeLower, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) ); 1152 1153 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 ); 1154 1155// flow.writeDOT( "joinaroundleftmost.dot" ); 1156 1157 if( getPlatform().isMapReduce() ) 1158 assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() ); 1159 1160 flow.complete(); 1161 1162 validateLength( flow, 5, null ); 1163 1164 List<Tuple> actual = getSinkAsList( flow ); 1165 1166 assertTrue( actual.contains( new Tuple( "1\tA\t1\tA\t1\ta" ) ) ); 1167 assertTrue( actual.contains( new Tuple( "2\tB\t2\tB\t2\tb" ) ) ); 1168 } 1169 1170 /** 1171 * Upper as leftmost and rightmost forcing two jobs 1172 * 1173 * @throws Exception 1174 */ 1175 @Test 1176 public void testJoinAroundJoinRightMostSwapped() throws Exception 1177 { 1178 getPlatform().copyFromLocal( inputFileLower ); 1179 getPlatform().copyFromLocal( inputFileUpper ); 1180 1181 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1182 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1183 1184 Map sources = new HashMap(); 1185 1186 sources.put( "lower", sourceLower ); 1187 sources.put( "upper1", sourceUpper ); 1188 sources.put( "upper2", sourceUpper ); 1189 1190 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinaroundjoinswapped" ), SinkMode.REPLACE ); 1191 1192 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1193 1194 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1195 Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter ); 1196 Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter ); 1197 1198 Pipe splice1 = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper1, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1199 1200 splice1 = new Each( splice1, new Identity() ); 1201 1202 // upper2 becomes leftmost, forcing a tap between the joins 1203 Pipe splice2 = new HashJoin( pipeUpper2, new Fields( "num" ), splice1, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) ); 1204 1205 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 ); 1206 1207 if( getPlatform().isMapReduce() ) 1208 assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() ); 1209 1210 flow.complete(); 1211 1212 validateLength( flow, 5, null ); 1213 1214 List<Tuple> actual = getSinkAsList( flow ); 1215 1216 assertTrue( actual.contains( new Tuple( "1\tA\t1\ta\t1\tA" ) ) ); 1217 assertTrue( actual.contains( new Tuple( "2\tB\t2\tb\t2\tB" ) ) ); 1218 } 1219 1220 @Test 1221 public void testJoinGroupByJoin() throws Exception 1222 { 1223 getPlatform().copyFromLocal( inputFileLower ); 1224 getPlatform().copyFromLocal( inputFileUpper ); 1225 getPlatform().copyFromLocal( inputFileJoined ); 1226 1227 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1228 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1229 Tap sourceJoined = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileJoined ); 1230 1231 Map sources = new HashMap(); 1232 1233 sources.put( "lower", sourceLower ); 1234 sources.put( "upper", sourceUpper ); 1235 sources.put( "joined", sourceJoined ); 1236 1237 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joingroupbyjoin" ), SinkMode.REPLACE ); 1238 1239 Function splitterLower = new RegexSplitter( new Fields( "numA", "lower" ), " " ); 1240 Function splitterUpper = new RegexSplitter( new Fields( "numB", "upper" ), " " ); 1241 Function splitterJoined = new RegexSplitter( new Fields( "numC", "lowerC", "upperC" ), "\t" ); 1242 1243 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitterLower ); 1244 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitterUpper ); 1245 Pipe pipeJoined = new Each( new Pipe( "joined" ), new Fields( "line" ), splitterJoined ); 1246 1247 Pipe pipe = new HashJoin( pipeLower, new Fields( "numA" ), pipeUpper, new Fields( "numB" ) ); 1248 1249 pipe = new GroupBy( pipe, new Fields( "numA" ) ); 1250 1251 pipe = new HashJoin( pipe, new Fields( "numA" ), pipeJoined, new Fields( "numC" ) ); 1252 1253 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, pipe ); 1254 1255 if( getPlatform().isMapReduce() ) 1256 assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() ); 1257 1258 flow.complete(); 1259 1260 validateLength( flow, 5, null ); 1261 1262 List<Tuple> actual = getSinkAsList( flow ); 1263 1264 assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\ta\tA" ) ) ); 1265 assertTrue( actual.contains( new Tuple( "2\tb\t2\tB\t2\tb\tB" ) ) ); 1266 } 1267 1268 /** 1269 * here the same file is fed into the same HashJoin. 1270 * <p> 1271 * This is three jobs. 1272 * <p> 1273 * a temp tap is inserted before the accumulated branch for two reasons on the common HashJoin 1274 * <p> 1275 * it is assumed the accumulated side is filtered down, so pushing to disk will preserve io 1276 * if accumulated side was streamed instead via a fork, only part of the file will accumulate into the HashJoin 1277 * <p> 1278 * /-T-\ <-- accumulated 1279 * T HJ 1280 * \---/ <-- streamed 1281 * 1282 * @throws Exception 1283 */ 1284 @Test 1285 public void testJoinSameSourceIntoJoin() throws Exception 1286 { 1287 getPlatform().copyFromLocal( inputFileLower ); 1288 getPlatform().copyFromLocal( inputFileUpper ); 1289 1290 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1291 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1292 1293 Map sources = new HashMap(); 1294 1295 sources.put( "lower", sourceLower ); 1296 sources.put( "upper1", sourceUpper ); 1297 sources.put( "upper2", sourceUpper ); 1298 1299 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsamesourceintojoin" ), SinkMode.REPLACE ); 1300 1301 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1302 1303 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1304 Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter ); 1305 Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter ); 1306 1307 Pipe splice1 = new HashJoin( pipeUpper1, new Fields( "num" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1308 1309 splice1 = new Each( splice1, new Identity() ); 1310 1311 Pipe splice2 = new HashJoin( pipeLower, new Fields( "num" ), splice1, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) ); 1312 1313 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 ); 1314 1315// flow.writeDOT( "joinsamesourceintojoin.dot" ); 1316 1317 if( getPlatform().isMapReduce() ) 1318 assertEquals( "wrong number of steps", 3, flow.getFlowSteps().size() ); 1319 1320 flow.complete(); 1321 1322 validateLength( flow, 5, null ); 1323 1324 List<Tuple> actual = getSinkAsList( flow ); 1325 1326 assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\tA" ) ) ); 1327 assertTrue( actual.contains( new Tuple( "2\tb\t2\tB\t2\tB" ) ) ); 1328 } 1329 1330 @Test 1331 public void testJoinSameSourceIntoJoinSimple() throws Exception 1332 { 1333 getPlatform().copyFromLocal( inputFileUpper ); 1334 1335 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1336 1337 Map sources = new HashMap(); 1338 1339 sources.put( "upper1", sourceUpper ); 1340 sources.put( "upper2", sourceUpper ); 1341 1342 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsamesourceintojoinsimple" ), SinkMode.REPLACE ); 1343 1344 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1345 1346 Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter ); 1347 Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter ); 1348 1349 Pipe splice1 = new HashJoin( pipeUpper1, new Fields( "num" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1350 1351 splice1 = new Each( splice1, new Identity() ); 1352 1353 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice1 ); 1354 1355// flow.writeDOT( "joinsamesourceintojoin.dot" ); 1356 1357 if( getPlatform().isMapReduce() ) 1358 assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() ); 1359 1360 flow.complete(); 1361 1362 validateLength( flow, 5, null ); 1363 1364 List<Tuple> actual = getSinkAsList( flow ); 1365 1366 assertTrue( actual.contains( new Tuple( "1\tA\t1\tA" ) ) ); 1367 assertTrue( actual.contains( new Tuple( "2\tB\t2\tB" ) ) ); 1368 } 1369 1370 /** 1371 * Loosely tests for a deadlock when BlockingHashJoinAnnotator rule doesn't excluce the GroupBy from the blocking 1372 * annotation. 1373 * <p> 1374 * the deadlock is random on the order of the paths traversed from the Source Tap + fork. 1375 * 1376 * @throws Exception 1377 */ 1378 @Test 1379 public void testJoinSameSourceOverGroupByIntoJoinSimple() throws Exception 1380 { 1381 getPlatform().copyFromLocal( inputFileLower ); 1382 getPlatform().copyFromLocal( inputFileUpper ); 1383 1384 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1385 1386 Map sources = new HashMap(); 1387 1388 sources.put( "upper1", sourceUpper ); 1389 sources.put( "upper2", sourceUpper ); 1390 1391 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsamesourceovergroupbyintojoinsimple" ), SinkMode.REPLACE ); 1392 1393 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1394 1395 Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter ); 1396 Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter ); 1397 1398 pipeUpper1 = new GroupBy( pipeUpper1, new Fields( "num" ) ); 1399 pipeUpper2 = new GroupBy( pipeUpper2, new Fields( "num" ) ); 1400 1401 Pipe splice1 = new HashJoin( pipeUpper1, new Fields( "num" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1402 1403 splice1 = new Each( splice1, new Identity() ); 1404 1405 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice1 ); 1406 1407 if( getPlatform().isMapReduce() ) 1408 assertEquals( "wrong number of steps", 3, flow.getFlowSteps().size() ); 1409 1410 flow.complete(); 1411 1412 validateLength( flow, 5, null ); 1413 1414 List<Tuple> actual = getSinkAsList( flow ); 1415 1416 assertTrue( actual.contains( new Tuple( "1\tA\t1\tA" ) ) ); 1417 assertTrue( actual.contains( new Tuple( "2\tB\t2\tB" ) ) ); 1418 } 1419 1420 /** 1421 * Tests that two independent streamed sources with loadable tributaries properly plan into a GroupBy 1422 * without loading unused sources 1423 * 1424 * @throws Exception 1425 */ 1426 @Test 1427 public void testJoinsIntoGroupBy() throws Exception 1428 { 1429 getPlatform().copyFromLocal( inputFileLower ); 1430 getPlatform().copyFromLocal( inputFileUpper ); 1431 1432 getPlatform().copyFromLocal( inputFileLhs ); 1433 getPlatform().copyFromLocal( inputFileRhs ); 1434 1435 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1436 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1437 1438 Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs ); 1439 Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs ); 1440 1441 Map sources = new HashMap(); 1442 1443 sources.put( "lower", sourceLower ); 1444 sources.put( "upper", sourceUpper ); 1445 sources.put( "lhs", sourceLhs ); 1446 sources.put( "rhs", sourceRhs ); 1447 1448 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintogroupby" ), SinkMode.REPLACE ); 1449 1450 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1451 1452 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1453 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 1454 1455 Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter ); 1456 Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter ); 1457 1458 Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1459 1460 upperLower = new Each( upperLower, new Identity() ); 1461 1462 Pipe lhsRhs = new HashJoin( pipeLhs, new Fields( "num" ), pipeRhs, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1463 1464 lhsRhs = new Each( lhsRhs, new Identity() ); 1465 1466 Pipe grouped = new GroupBy( "merging", Pipe.pipes( upperLower, lhsRhs ), new Fields( "num1" ) ); 1467 1468 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped ); 1469 1470 if( getPlatform().isMapReduce() ) 1471 assertEquals( "wrong number of steps", 1, flow.getFlowSteps().size() ); 1472 1473 flow.complete(); 1474 1475 validateLength( flow, 42, null ); 1476 1477 List<Tuple> actual = getSinkAsList( flow ); 1478 1479 assertTrue( actual.contains( new Tuple( "1\ta\t1\tA" ) ) ); 1480 assertTrue( actual.contains( new Tuple( "5\te\t5\tE" ) ) ); 1481 } 1482 1483 @Test 1484 public void testJoinSamePipeAroundGroupBy() throws Exception 1485 { 1486 getPlatform().copyFromLocal( inputFileLower ); 1487 1488 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1489 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipearoundgroupby" ), SinkMode.REPLACE ); 1490 1491 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1492 1493 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1494 1495 Pipe lhsPipe = new Each( new Pipe( "lhs", pipeLower ), new Identity() ); 1496 1497 Pipe rhsPipe = new Each( new Pipe( "rhs", pipeLower ), new Identity() ); 1498 1499 rhsPipe = new GroupBy( rhsPipe, new Fields( "num" ) ); 1500 1501 rhsPipe = new Each( rhsPipe, new Identity() ); 1502 1503 Pipe pipe = new HashJoin( lhsPipe, new Fields( "num" ), rhsPipe, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) ); 1504 1505 Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); 1506 1507 flow.complete(); 1508 1509 validateLength( flow, 5, null ); 1510 1511 List<Tuple> actual = getSinkAsList( flow ); 1512 1513 assertTrue( actual.contains( new Tuple( "1\ta\t1\ta" ) ) ); 1514 assertTrue( actual.contains( new Tuple( "2\tb\t2\tb" ) ) ); 1515 } 1516 1517 /** 1518 * This test results in two MR jobs because one join feeds into the accumulated side of the second. A mapper 1519 * can only stream on branch at a time forcing a temp file between the mappers. see next test for swapped join 1520 * 1521 * @throws Exception 1522 */ 1523 @Test 1524 public void testJoinsIntoCoGroupLhs() throws Exception 1525 { 1526 getPlatform().copyFromLocal( inputFileLower ); 1527 getPlatform().copyFromLocal( inputFileUpper ); 1528 1529 getPlatform().copyFromLocal( inputFileLhs ); 1530 getPlatform().copyFromLocal( inputFileRhs ); 1531 1532 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1533 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1534 1535 Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs ); 1536 Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs ); 1537 1538 Map sources = new HashMap(); 1539 1540 sources.put( "lower", sourceLower ); 1541 sources.put( "upper", sourceUpper ); 1542 sources.put( "lhs", sourceLhs ); 1543 sources.put( "rhs", sourceRhs ); 1544 1545 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintocogrouplhs" ), SinkMode.REPLACE ); 1546 1547 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1548 1549 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1550 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 1551 1552 Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter ); 1553 Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter ); 1554 1555 Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) ); 1556 1557 upperLower = new Each( upperLower, new Identity() ); 1558 1559 Pipe lhsUpperLower = new HashJoin( pipeLhs, new Fields( "num" ), upperLower, new Fields( "numUpperLower" ), new Fields( "numLhs", "charLhs", "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) ); 1560 1561 lhsUpperLower = new Each( lhsUpperLower, new Identity() ); 1562 1563 Pipe grouped = new CoGroup( "cogrouping", lhsUpperLower, new Fields( "numLhs" ), pipeRhs, new Fields( "num" ) ); 1564 1565 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped ); 1566 1567 if( getPlatform().isMapReduce() ) 1568 assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() ); 1569 1570 flow.complete(); 1571 1572 validateLength( flow, 37, null ); 1573 1574 List<Tuple> actual = getSinkAsList( flow ); 1575 1576 assertTrue( actual.contains( new Tuple( "1\ta\t1\ta\t1\tA\t1\tA" ) ) ); 1577 assertTrue( actual.contains( new Tuple( "5\ta\t5\te\t5\tE\t5\tA" ) ) ); 1578 } 1579 1580 /** 1581 * This test results in one MR jobs because one join feeds into the streamed side of the second. 1582 * 1583 * @throws Exception 1584 */ 1585 @Test 1586 public void testJoinsIntoCoGroupLhsSwappedJoin() throws Exception 1587 { 1588 getPlatform().copyFromLocal( inputFileLower ); 1589 getPlatform().copyFromLocal( inputFileUpper ); 1590 1591 getPlatform().copyFromLocal( inputFileLhs ); 1592 getPlatform().copyFromLocal( inputFileRhs ); 1593 1594 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1595 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1596 1597 Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs ); 1598 Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs ); 1599 1600 Map sources = new HashMap(); 1601 1602 sources.put( "lower", sourceLower ); 1603 sources.put( "upper", sourceUpper ); 1604 sources.put( "lhs", sourceLhs ); 1605 sources.put( "rhs", sourceRhs ); 1606 1607 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintocogrouplhsswappedjoin" ), SinkMode.REPLACE ); 1608 1609 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1610 1611 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1612 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 1613 1614 Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter ); 1615 Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter ); 1616 1617 Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) ); 1618 1619 upperLower = new Each( upperLower, new Identity() ); 1620 1621 Pipe lhsUpperLower = new HashJoin( upperLower, new Fields( "numUpperLower" ), pipeLhs, new Fields( "num" ), new Fields( "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower", "numLhs", "charLhs" ) ); 1622 1623 lhsUpperLower = new Each( lhsUpperLower, new Identity() ); 1624 1625 Pipe grouped = new CoGroup( "cogrouping", lhsUpperLower, new Fields( "numLhs" ), pipeRhs, new Fields( "num" ) ); 1626 1627 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped ); 1628 1629 if( getPlatform().isMapReduce() ) 1630 assertEquals( "wrong number of steps", 1, flow.getFlowSteps().size() ); 1631 1632 flow.complete(); 1633 1634 validateLength( flow, 37, null ); 1635 1636 List<Tuple> actual = getSinkAsList( flow ); 1637 1638 assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\ta\t1\tA" ) ) ); 1639 assertTrue( actual.contains( new Tuple( "5\te\t5\tE\t5\te\t5\tE" ) ) ); 1640 } 1641 1642 @Test 1643 public void testJoinsIntoCoGroupRhs() throws Exception 1644 { 1645 getPlatform().copyFromLocal( inputFileLower ); 1646 getPlatform().copyFromLocal( inputFileUpper ); 1647 1648 getPlatform().copyFromLocal( inputFileLhs ); 1649 getPlatform().copyFromLocal( inputFileRhs ); 1650 1651 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1652 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1653 1654 Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs ); 1655 Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs ); 1656 1657 Map sources = new HashMap(); 1658 1659 sources.put( "lower", sourceLower ); 1660 sources.put( "upper", sourceUpper ); 1661 sources.put( "lhs", sourceLhs ); 1662 sources.put( "rhs", sourceRhs ); 1663 1664 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintocogrouprhs" ), SinkMode.REPLACE ); 1665 1666 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1667 1668 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1669 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 1670 1671 Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter ); 1672 Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter ); 1673 1674 Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) ); 1675 1676 upperLower = new Each( upperLower, new Identity() ); 1677 1678 Pipe lhsUpperLower = new HashJoin( pipeLhs, new Fields( "num" ), upperLower, new Fields( "numUpperLower" ), new Fields( "numLhs", "charLhs", "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) ); 1679 1680 lhsUpperLower = new Each( lhsUpperLower, new Identity() ); 1681 1682 Pipe grouped = new CoGroup( "cogrouping", pipeRhs, new Fields( "num" ), lhsUpperLower, new Fields( "numLhs" ) ); 1683 1684 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped ); 1685 1686 if( getPlatform().isMapReduce() ) 1687 assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() ); 1688 1689 flow.complete(); 1690 1691 validateLength( flow, 37, null ); 1692 1693 List<Tuple> actual = getSinkAsList( flow ); 1694 1695 assertTrue( actual.contains( new Tuple( "1\tA\t1\ta\t1\ta\t1\tA" ) ) ); 1696 assertTrue( actual.contains( new Tuple( "5\tE\t5\te\t5\te\t5\tE" ) ) ); 1697 } 1698 1699 @Test 1700 public void testJoinsIntoCoGroup() throws Exception 1701 { 1702 getPlatform().copyFromLocal( inputFileLower ); 1703 getPlatform().copyFromLocal( inputFileUpper ); 1704 1705 getPlatform().copyFromLocal( inputFileLhs ); 1706 getPlatform().copyFromLocal( inputFileRhs ); 1707 1708 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1709 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1710 1711 Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs ); 1712 Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs ); 1713 1714 Map sources = new HashMap(); 1715 1716 sources.put( "lower", sourceLower ); 1717 sources.put( "upper", sourceUpper ); 1718 sources.put( "lhs", sourceLhs ); 1719 sources.put( "rhs", sourceRhs ); 1720 1721 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintocogroup" ), SinkMode.REPLACE ); 1722 1723 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1724 1725 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1726 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 1727 1728 Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter ); 1729 Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter ); 1730 1731 Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "numUpperLower1", "charUpperLower1", "numUpperLower2", "charUpperLower2" ) ); 1732 1733 upperLower = new Each( upperLower, new Identity() ); 1734 1735 Pipe lhsRhs = new HashJoin( pipeLhs, new Fields( "num" ), pipeRhs, new Fields( "num" ), new Fields( "numLhsRhs1", "charLhsRhs1", "numLhsRhs2", "charLhsRhs2" ) ); 1736 1737 lhsRhs = new Each( lhsRhs, new Identity() ); 1738 1739 Pipe grouped = new CoGroup( "cogrouping", upperLower, new Fields( "numUpperLower1" ), lhsRhs, new Fields( "numLhsRhs1" ) ); 1740 1741 Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped ); 1742 1743 if( getPlatform().isMapReduce() ) 1744 assertEquals( "wrong number of steps", 1, flow.getFlowSteps().size() ); 1745 1746 flow.complete(); 1747 1748 validateLength( flow, 37, null ); 1749 1750 List<Tuple> actual = getSinkAsList( flow ); 1751 1752 assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\ta\t1\tA" ) ) ); 1753 assertTrue( actual.contains( new Tuple( "5\te\t5\tE\t5\te\t5\tE" ) ) ); 1754 } 1755 1756 public static class AllComparator implements Comparator<Comparable>, Hasher<Comparable>, Serializable 1757 { 1758 1759 @Override 1760 public int compare( Comparable lhs, Comparable rhs ) 1761 { 1762 return lhs.toString().compareTo( rhs.toString() ); 1763 } 1764 1765 @Override 1766 public int hashCode( Comparable value ) 1767 { 1768 if( value == null ) 1769 return 0; 1770 1771 return value.toString().hashCode(); 1772 } 1773 } 1774 1775 /** 1776 * Tests Hasher being honored even if default comparator is null. 1777 * 1778 * @throws Exception 1779 */ 1780 @Test 1781 public void testJoinWithHasher() throws Exception 1782 { 1783 getPlatform().copyFromLocal( inputFileLower ); 1784 getPlatform().copyFromLocal( inputFileUpper ); 1785 1786 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1787 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1788 1789 Map sources = new HashMap(); 1790 1791 sources.put( "lower", sourceLower ); 1792 sources.put( "upper", sourceUpper ); 1793 1794 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinhasher" ), SinkMode.REPLACE ); 1795 1796 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1797 1798 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1799 1800 pipeLower = new Each( pipeLower, new Fields( "num" ), new ExpressionFunction( Fields.ARGS, "Integer.parseInt( num )", String.class ), Fields.REPLACE ); 1801 1802 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 1803 1804 Fields num = new Fields( "num" ); 1805 num.setComparator( "num", new AllComparator() ); 1806 1807 Pipe splice = new HashJoin( pipeLower, num, pipeUpper, new Fields( "num" ), Fields.size( 4 ) ); 1808 1809 Map<Object, Object> properties = getProperties(); 1810 1811 Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice ); 1812 1813 flow.complete(); 1814 1815 validateLength( flow, 5 ); 1816 1817 List<Tuple> values = getSinkAsList( flow ); 1818 1819 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 1820 assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); 1821 } 1822 1823 @Test 1824 public void testJoinNone() throws Exception 1825 { 1826 getPlatform().copyFromLocal( inputFileLower ); 1827 getPlatform().copyFromLocal( inputFileUpper ); 1828 1829 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1830 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1831 1832 Map sources = new HashMap(); 1833 1834 sources.put( "lower", sourceLower ); 1835 sources.put( "upper", sourceUpper ); 1836 1837 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinnone" ), SinkMode.REPLACE ); 1838 1839 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 1840 1841 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 1842 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); 1843 1844 Pipe splice = new HashJoin( pipeLower, Fields.NONE, pipeUpper, Fields.NONE, Fields.size( 4 ) ); 1845 1846 Map<Object, Object> properties = getProperties(); 1847 1848 Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice ); 1849 1850 flow.complete(); 1851 1852 validateLength( flow, 25 ); 1853 1854 List<Tuple> values = getSinkAsList( flow ); 1855 1856 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 1857 assertTrue( values.contains( new Tuple( "1\ta\t2\tB" ) ) ); 1858 assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); 1859 } 1860 1861 @Test 1862 public void testGroupBySplitJoins() throws Exception 1863 { 1864 getPlatform().copyFromLocal( inputFileLower ); 1865 getPlatform().copyFromLocal( inputFileUpper ); 1866 getPlatform().copyFromLocal( inputFileJoined ); 1867 1868 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 1869 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 1870 Tap sourceJoined = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileJoined ); 1871 1872 Map sources = new HashMap(); 1873 1874 sources.put( "lower", sourceLower ); 1875 sources.put( "upper", sourceUpper ); 1876 sources.put( "joined", sourceJoined ); 1877 1878 Tap lhsSink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "lhs" ), SinkMode.REPLACE ); 1879 Tap rhsSink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "rhs" ), SinkMode.REPLACE ); 1880 1881 Map sinks = new HashMap(); 1882 1883 sinks.put( "lhs", lhsSink ); 1884 sinks.put( "rhs", rhsSink ); 1885 1886 Function splitterLower = new RegexSplitter( new Fields( "numA", "lower" ), " " ); 1887 Function splitterUpper = new RegexSplitter( new Fields( "numB", "upper" ), " " ); 1888 Function splitterJoined = new RegexSplitter( new Fields( "numC", "lowerC", "upperC" ), "\t" ); 1889 1890 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitterLower ); 1891 Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitterUpper ); 1892 Pipe pipeJoined = new Each( new Pipe( "joined" ), new Fields( "line" ), splitterJoined ); 1893 1894 Pipe pipe = new GroupBy( pipeLower, new Fields( "numA" ) ); 1895 1896 pipe = new Every( pipe, Fields.ALL, new TestIdentityBuffer( new Fields( "numA" ), 5, false ), Fields.RESULTS ); 1897 1898 Pipe lhsPipe = new Each( pipe, new Identity() ); 1899 lhsPipe = new HashJoin( "lhs", lhsPipe, new Fields( "numA" ), pipeUpper, new Fields( "numB" ) ); 1900 1901 Pipe rhsPipe = new Each( pipe, new Identity() ); 1902 rhsPipe = new HashJoin( "rhs", rhsPipe, new Fields( "numA" ), pipeJoined, new Fields( "numC" ) ); 1903 1904 Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, lhsPipe, rhsPipe ); 1905 1906 if( getPlatform().isMapReduce() ) 1907 assertEquals( "wrong number of steps", 3, flow.getFlowSteps().size() ); 1908 1909 flow.complete(); 1910 1911 validateLength( flow.openSink( "lhs" ), 5, null ); 1912 validateLength( flow.openSink( "rhs" ), 5, null ); 1913 1914 List<Tuple> lhsActual = asList( flow, lhsSink ); 1915 1916 assertTrue( lhsActual.contains( new Tuple( "1\ta\t1\tA" ) ) ); 1917 assertTrue( lhsActual.contains( new Tuple( "2\tb\t2\tB" ) ) ); 1918 1919 List<Tuple> rhsActual = asList( flow, rhsSink ); 1920 1921 assertTrue( rhsActual.contains( new Tuple( "1\ta\t1\ta\tA" ) ) ); 1922 assertTrue( rhsActual.contains( new Tuple( "2\tb\t2\tb\tB" ) ) ); 1923 } 1924 1925 /** 1926 * When run against a cluster a Merge before a GroupBy can hide the streamed/accumulated nature of a branch. 1927 * <p> 1928 * The planner nw 1929 * <p> 1930 * commented code is for troubleshooting. 1931 * 1932 * @throws Exception 1933 */ 1934 @Test 1935 public void testJoinMergeGroupBy() throws Exception 1936 { 1937 getPlatform().copyFromLocal( inputFileNums10 ); 1938 getPlatform().copyFromLocal( inputFileNums20 ); 1939 1940 Tap lhsTap = getPlatform().getTextFile( new Fields( "id" ), inputFileNums10 ); 1941 Tap rhsTap = getPlatform().getTextFile( new Fields( "id2" ), inputFileNums20 ); 1942 1943 Pipe lhs = new Pipe( "lhs" ); 1944 Pipe rhs = new Pipe( "rhs" ); 1945 1946// Pipe joined = new CoGroup( messages, new Fields( "id" ), people, new Fields( "id2" ) ); 1947 Pipe joined = new HashJoin( lhs, new Fields( "id" ), rhs, new Fields( "id2" ) ); 1948 1949 Pipe pruned = new Each( joined, new Fields( "id2" ), new Identity(), Fields.RESULTS ); 1950// pruned = new Checkpoint( pruned ); 1951 Pipe merged = new Merge( pruned, rhs ); 1952 Pipe grouped = new GroupBy( merged, new Fields( "id2" ) ); 1953// Pipe grouped = new GroupBy( Pipe.pipes( pruned, people ), new Fields( "id2" ) ); 1954 Aggregator count = new Count( new Fields( "count" ) ); 1955 Pipe counted = new Every( grouped, count ); 1956 1957 String testJoinMerge = "testJoinMergeGroupBy/" + ( ( joined instanceof CoGroup ) ? "cogroup" : "hashjoin" ); 1958 Tap sink = getPlatform().getDelimitedFile( Fields.ALL, true, "\t", null, getOutputPath( testJoinMerge ), SinkMode.REPLACE ); 1959 1960 FlowDef flowDef = FlowDef.flowDef() 1961 .setName( "join-merge" ) 1962 .addSource( rhs, rhsTap ) 1963 .addSource( lhs, lhsTap ) 1964 .addTailSink( counted, sink ); 1965 1966 Flow flow = getPlatform().getFlowConnector().connect( flowDef ); 1967 1968// flow.writeDOT( "joinmerge.dot" ); 1969// flow.writeStepsDOT( "joinmerge-steps.dot" ); 1970 1971 flow.complete(); 1972 1973 validateLength( flow, 20 ); 1974 1975 List<Tuple> values = getSinkAsList( flow ); 1976 List<Tuple> expected = new ArrayList<Tuple>(); 1977 1978 expected.add( new Tuple( "1", "2" ) ); 1979 expected.add( new Tuple( "10", "2" ) ); 1980 expected.add( new Tuple( "11", "1" ) ); 1981 expected.add( new Tuple( "12", "1" ) ); 1982 expected.add( new Tuple( "13", "1" ) ); 1983 expected.add( new Tuple( "14", "1" ) ); 1984 expected.add( new Tuple( "15", "1" ) ); 1985 expected.add( new Tuple( "16", "1" ) ); 1986 expected.add( new Tuple( "17", "1" ) ); 1987 expected.add( new Tuple( "18", "1" ) ); 1988 expected.add( new Tuple( "19", "1" ) ); 1989 expected.add( new Tuple( "2", "2" ) ); 1990 expected.add( new Tuple( "20", "1" ) ); 1991 expected.add( new Tuple( "3", "2" ) ); 1992 expected.add( new Tuple( "4", "2" ) ); 1993 expected.add( new Tuple( "5", "2" ) ); 1994 expected.add( new Tuple( "6", "2" ) ); 1995 expected.add( new Tuple( "7", "2" ) ); 1996 expected.add( new Tuple( "8", "2" ) ); 1997 expected.add( new Tuple( "9", "2" ) ); 1998 1999 Collections.sort( values ); 2000 Collections.sort( expected ); 2001 2002 assertEquals( expected, values ); 2003 } 2004 2005 /** 2006 * Under tez, this can result in the HashJoin being duplicated across nodes for each split after the HashJoin 2007 * BoundaryBalanceJoinSplitTransformer inserts a Boundary at the split, preventing duplication of the path 2008 * 2009 * @throws Exception 2010 */ 2011 @Test 2012 public void testJoinSplit() throws Exception 2013 { 2014 getPlatform().copyFromLocal( inputFileLhs ); 2015 getPlatform().copyFromLocal( inputFileRhs ); 2016 2017 FlowDef flowDef = FlowDef.flowDef() 2018 .addSource( "lhs", getPlatform().getTextFile( inputFileLhs ) ) 2019 .addSource( "rhs", getPlatform().getTextFile( inputFileRhs ) ) 2020 .addSink( "lhsSink", getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "lhs" ), SinkMode.REPLACE ) ) 2021 .addSink( "rhsSink", getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "rhs" ), SinkMode.REPLACE ) ); 2022 2023 Pipe pipeLower = new Each( "lhs", new Fields( "line" ), new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " ) ); 2024 Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) ); 2025 2026 Pipe join = new HashJoin( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() ); 2027 2028 Pipe pipeLhs = new Each( new Pipe( "lhsSink", join ), new Identity() ); 2029 Pipe pipeRhs = new Each( new Pipe( "rhsSink", join ), new Identity() ); 2030 2031 flowDef 2032 .addTail( pipeLhs ) 2033 .addTail( pipeRhs ); 2034 2035 Flow flow = getPlatform().getFlowConnector().connect( flowDef ); 2036 2037 flow.complete(); 2038 2039 validateLength( flow, 37, null ); 2040 2041 List<Tuple> values = asList( flow, flowDef.getSinks().get( "lhsSink" ) ); 2042 2043 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 2044 assertTrue( values.contains( new Tuple( "1\ta\t1\tB" ) ) ); 2045 2046 values = asList( flow, flowDef.getSinks().get( "rhsSink" ) ); 2047 2048 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 2049 assertTrue( values.contains( new Tuple( "1\ta\t1\tB" ) ) ); 2050 } 2051 2052 /** 2053 * catches a situation where BottomUpJoinedBoundariesNodePartitioner may capture an invalid HashJoin sub-graph 2054 * if the in-bound Boundary is split upon. 2055 */ 2056 @Test 2057 public void testSameSourceJoinSplitIntoJoin() throws Exception 2058 { 2059 getPlatform().copyFromLocal( inputFileLhs ); 2060 getPlatform().copyFromLocal( inputFileRhs ); 2061 2062 FlowDef flowDef = FlowDef.flowDef() 2063 .addSource( "lhs", getPlatform().getTextFile( inputFileLhs ) ) 2064 .addSource( "rhs", getPlatform().getTextFile( inputFileLhs ) ) 2065 .addSource( "joinSecond", getPlatform().getTextFile( inputFileRhs ) ) 2066 .addSink( "lhsSink", getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "lhs" ), SinkMode.REPLACE ) ) 2067 .addSink( "rhsSink", getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "rhs" ), SinkMode.REPLACE ) ); 2068 2069 Pipe pipeLower = new Each( "lhs", new Fields( "line" ), new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " ) ); 2070 Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) ); 2071 2072 Pipe joinFirst = new HashJoin( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() ); 2073 2074 Pipe pipeLhs = new Each( new Pipe( "lhsSink", joinFirst ), new Identity() ); 2075 2076 Pipe joinSecond = new Each( "joinSecond", new Fields( "line" ), new RegexSplitter( new Fields( "numRHSSecond", "charRHSSecond" ), " " ) ); 2077 2078 joinSecond = new HashJoin( joinFirst, new Fields( "numLHS" ), joinSecond, new Fields( "numRHSSecond" ) ); 2079 2080 Pipe pipeRhs = new Each( new Pipe( "rhsSink", joinSecond ), new Identity() ); 2081 2082 flowDef 2083 .addTail( pipeLhs ) 2084 .addTail( pipeRhs ); 2085 2086 Flow flow = getPlatform().getFlowConnector().connect( flowDef ); 2087 2088 flow.complete(); 2089 2090 List<Tuple> values = asList( flow, flowDef.getSinks().get( "lhsSink" ) ); 2091 2092 assertEquals( 37, values.size() ); 2093 assertTrue( values.contains( new Tuple( "1\ta\t1\ta" ) ) ); 2094 assertTrue( values.contains( new Tuple( "1\ta\t1\tb" ) ) ); 2095 2096 values = asList( flow, flowDef.getSinks().get( "rhsSink" ) ); 2097 2098 assertEquals( 109, values.size() ); 2099 assertTrue( values.contains( new Tuple( "1\ta\t1\ta\t1\tA" ) ) ); 2100 assertTrue( values.contains( new Tuple( "1\ta\t1\tb\t1\tB" ) ) ); 2101 } 2102 2103 /** 2104 * checks that a split after a HashJoin does not result in the HashJoin execution being duplicated across 2105 * multiple nodes, one for each branch in the split. 2106 */ 2107 @Test 2108 public void testJoinSplitBeforeJoin() throws Exception 2109 { 2110 getPlatform().copyFromLocal( inputFileLhs ); 2111 getPlatform().copyFromLocal( inputFileRhs ); 2112 2113 FlowDef flowDef = FlowDef.flowDef() 2114 .addSource( "lhs", getPlatform().getTextFile( inputFileLhs ) ) 2115 .addSource( "rhs", getPlatform().getTextFile( inputFileRhs ) ) 2116 .addSource( "joinSecond", getPlatform().getTextFile( inputFileRhs ) ) 2117 .addSink( "lhsSink", getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "lhs" ), SinkMode.REPLACE ) ) 2118 .addSink( "rhsSink", getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "rhs" ), SinkMode.REPLACE ) ); 2119 2120 Pipe pipeLower = new Each( "lhs", new Fields( "line" ), new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " ) ); 2121 Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) ); 2122 2123 pipeUpper = new Checkpoint( pipeUpper ); 2124 2125 HashJoin hashJoin = new HashJoin( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() ); 2126 2127 Pipe joinFirst = hashJoin; 2128 2129 joinFirst = new Each( joinFirst, new Identity() ); 2130 2131 Pipe pipeLhs = new Each( new Pipe( "lhsSink", joinFirst ), new Identity() ); 2132 2133 pipeLhs = new GroupBy( pipeLhs, new Fields( "numLHS" ) ); 2134 2135 joinFirst = new Each( new Pipe( "lhsSplit", joinFirst ), new Identity() ); 2136 2137 Pipe joinSecond = new Each( "joinSecond", new Fields( "line" ), new RegexSplitter( new Fields( "numRHSSecond", "charRHSSecond" ), " " ) ); 2138 2139 joinSecond = new CoGroup( joinFirst, new Fields( "numLHS" ), joinSecond, new Fields( "numRHSSecond" ) ); 2140 2141 Pipe pipeRhs = new Each( new Pipe( "rhsSink", joinSecond ), new Identity() ); 2142 2143 flowDef 2144 .addTail( pipeLhs ) 2145 .addTail( pipeRhs ); 2146 2147 Flow flow = getPlatform().getFlowConnector().connect( flowDef ); 2148 2149 if( getPlatform().isDAG() ) 2150 { 2151 FlowStep flowStep = (FlowStep) flow.getFlowSteps().get( 0 ); 2152 List<ElementGraph> elementGraphs = flowStep.getFlowNodeGraph().getElementGraphs( hashJoin ); 2153 2154 assertEquals( 1, elementGraphs.size() ); 2155 } 2156 2157 flow.complete(); 2158 2159 List<Tuple> values = asList( flow, flowDef.getSinks().get( "lhsSink" ) ); 2160 2161 assertEquals( 37, values.size() ); 2162 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 2163 assertTrue( values.contains( new Tuple( "1\ta\t1\tB" ) ) ); 2164 2165 values = asList( flow, flowDef.getSinks().get( "rhsSink" ) ); 2166 2167 assertEquals( 109, values.size() ); 2168 assertTrue( values.contains( new Tuple( "1\ta\t1\tA\t1\tA" ) ) ); 2169 assertTrue( values.contains( new Tuple( "1\ta\t1\tB\t1\tB" ) ) ); 2170 } 2171 2172 @Test 2173 public void testGroupBySplitGroupByJoin() throws Exception 2174 { 2175 getPlatform().copyFromLocal( inputFileLower ); 2176 2177 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 2178 2179 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "sink" ), SinkMode.REPLACE ); 2180 2181 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 2182 2183 Pipe pipeFirst = new Pipe( "first" ); 2184 pipeFirst = new Each( pipeFirst, new Fields( "line" ), splitter ); 2185 pipeFirst = new GroupBy( pipeFirst, new Fields( "num" ) ); 2186 pipeFirst = new Every( pipeFirst, new Fields( "char" ), new First( new Fields( "firstFirst" ) ), Fields.ALL ); 2187 2188 Pipe pipeSecond = new Pipe( "second", pipeFirst ); 2189 pipeSecond = new Each( pipeSecond, new Identity() ); 2190 pipeSecond = new GroupBy( pipeSecond, new Fields( "num" ) ); 2191 pipeSecond = new Every( pipeSecond, new Fields( "firstFirst" ), new First( new Fields( "secondFirst" ) ), Fields.ALL ); 2192 pipeSecond = new GroupBy( pipeSecond, new Fields( "num" ) ); 2193 pipeSecond = new Every( pipeSecond, new Fields( "secondFirst" ), new First( new Fields( "thirdFirst" ) ), Fields.ALL ); 2194 2195 Pipe splice = new HashJoin( pipeFirst, new Fields( "num" ), pipeSecond, new Fields( "num" ), Fields.size( 4 ) ); 2196 2197 Flow flow = getPlatform().getFlowConnector().connect( source, sink, splice ); 2198 2199 flow.complete(); 2200 2201 validateLength( flow, 5, null ); 2202 2203 List<Tuple> values = getSinkAsList( flow ); 2204 2205 assertTrue( values.contains( new Tuple( "1\ta\t1\ta" ) ) ); 2206 assertTrue( values.contains( new Tuple( "2\tb\t2\tb" ) ) ); 2207 assertTrue( values.contains( new Tuple( "3\tc\t3\tc" ) ) ); 2208 assertTrue( values.contains( new Tuple( "4\td\t4\td" ) ) ); 2209 assertTrue( values.contains( new Tuple( "5\te\t5\te" ) ) ); 2210 } 2211 2212 @Test 2213 public void testGroupBySplitSplitGroupByJoin() throws Exception 2214 { 2215 getPlatform().copyFromLocal( inputFileLower ); 2216 2217 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 2218 2219 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "sink" ), SinkMode.REPLACE ); 2220 2221 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 2222 2223 Pipe pipeFirst = new Pipe( "first" ); 2224 pipeFirst = new Each( pipeFirst, new Fields( "line" ), splitter ); 2225 pipeFirst = new GroupBy( pipeFirst, new Fields( "num" ) ); 2226 pipeFirst = new Every( pipeFirst, new Fields( "char" ), new First( new Fields( "firstFirst" ) ), Fields.ALL ); 2227 2228 Pipe pipeSecond = new Pipe( "second", pipeFirst ); 2229 pipeSecond = new Each( pipeSecond, new Identity() ); 2230 pipeSecond = new GroupBy( pipeSecond, new Fields( "num" ) ); 2231 pipeSecond = new Every( pipeSecond, new Fields( "firstFirst" ), new First( new Fields( "secondFirst" ) ), Fields.ALL ); 2232 2233 Pipe splice = new HashJoin( pipeFirst, new Fields( "num" ), pipeSecond, new Fields( "num" ), Fields.size( 4 ) ); 2234// Pipe splice = new HashJoin( pipeSecond, new Fields( "num" ), pipeFirst, new Fields( "num" ), Fields.size( 4 ) ); 2235 2236 splice = new HashJoin( splice, new Fields( 0 ), pipeSecond, new Fields( "num" ), Fields.size( 6 ) ); 2237 2238 Flow flow = getPlatform().getFlowConnector().connect( source, sink, splice ); 2239 2240 flow.complete(); 2241 2242 validateLength( flow, 5, null ); 2243 2244 List<Tuple> values = getSinkAsList( flow ); 2245 2246 assertTrue( values.contains( new Tuple( "1\ta\t1\ta\t1\ta" ) ) ); 2247 assertTrue( values.contains( new Tuple( "2\tb\t2\tb\t2\tb" ) ) ); 2248 assertTrue( values.contains( new Tuple( "3\tc\t3\tc\t3\tc" ) ) ); 2249 assertTrue( values.contains( new Tuple( "4\td\t4\td\t4\td" ) ) ); 2250 assertTrue( values.contains( new Tuple( "5\te\t5\te\t5\te" ) ) ); 2251 } 2252 2253 @Test 2254 public void testGroupBySplitAroundSplitGroupByJoin() throws Exception 2255 { 2256 getPlatform().copyFromLocal( inputFileLower ); 2257 2258 Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 2259 2260 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "sink" ), SinkMode.REPLACE ); 2261 Tap sink2 = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "sink2" ), SinkMode.REPLACE ); 2262 2263 Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); 2264 2265 Pipe pipeInit = new Pipe( "init" ); 2266 Pipe pipeFirst = new Pipe( "first", pipeInit ); 2267 pipeFirst = new Each( pipeFirst, new Fields( "line" ), splitter ); 2268 pipeFirst = new GroupBy( pipeFirst, new Fields( "num" ) ); 2269 pipeFirst = new Every( pipeFirst, new Fields( "char" ), new First( new Fields( "firstFirst" ) ), Fields.ALL ); 2270 2271 Pipe sink2Pipe = new Pipe( "sink2", pipeFirst ); 2272 2273 Pipe pipeSecond = new Pipe( "second", pipeInit ); 2274 pipeSecond = new Each( pipeSecond, new Fields( "line" ), splitter ); 2275 pipeSecond = new GroupBy( pipeSecond, new Fields( "num" ) ); 2276 pipeSecond = new Every( pipeSecond, new Fields( "char" ), new First( new Fields( "secondFirst" ) ), Fields.ALL ); 2277 2278// Pipe splice = new HashJoin( pipeFirst, new Fields( "num" ), pipeSecond, new Fields( "num" ), Fields.size( 4 ) ); 2279 Pipe splice = new HashJoin( pipeSecond, new Fields( "num" ), pipeFirst, new Fields( "num" ), Fields.size( 4 ) ); 2280 2281 Pipe pipeThird = new Pipe( "third", pipeSecond ); 2282 pipeThird = new Each( pipeThird, new Identity() ); 2283 pipeThird = new GroupBy( pipeThird, new Fields( "num" ) ); 2284 pipeThird = new Every( pipeThird, new Fields( "secondFirst" ), new First( new Fields( "thirdFirst" ) ), Fields.ALL ); 2285 2286 splice = new HashJoin( splice, new Fields( 0 ), pipeThird, new Fields( "num" ), Fields.size( 6 ) ); 2287 2288 FlowDef flowDef = FlowDef.flowDef() 2289 .setName( splice.getName() ) 2290 .addSource( "init", source ) 2291 .addTailSink( splice, sink ) 2292 .addTailSink( sink2Pipe, sink2 ); 2293 2294 Flow flow = getPlatform().getFlowConnector().connect( flowDef ); 2295 2296 flow.complete(); 2297 2298 validateLength( flow, 5, null ); 2299 2300 List<Tuple> values = getSinkAsList( flow ); 2301 2302 assertTrue( values.contains( new Tuple( "1\ta\t1\ta\t1\ta" ) ) ); 2303 assertTrue( values.contains( new Tuple( "2\tb\t2\tb\t2\tb" ) ) ); 2304 assertTrue( values.contains( new Tuple( "3\tc\t3\tc\t3\tc" ) ) ); 2305 assertTrue( values.contains( new Tuple( "4\td\t4\td\t4\td" ) ) ); 2306 assertTrue( values.contains( new Tuple( "5\te\t5\te\t5\te" ) ) ); 2307 } 2308 2309 /** 2310 * This test checks for a deadlock when the same input is forked, adapted on one edge, then hashjoined back together. 2311 * 2312 * @throws Exception 2313 */ 2314 @Test 2315 public void testForkThenJoin() throws Exception 2316 { 2317 getPlatform().copyFromLocal( inputFileLower ); 2318 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 2319 2320 Map sources = new HashMap(); 2321 2322 sources.put( "lower", sourceLower ); 2323 2324 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "join" ), SinkMode.REPLACE ); 2325 2326 Function splitter = new RegexSplitter( new Fields( "num", "text" ), " " ); 2327 2328 Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); 2329 Pipe pipeUpper = new Each( new Pipe( "upper", pipeLower ), new Fields( "text" ), 2330 new ExpressionFunction( Fields.ARGS, "text.toUpperCase(java.util.Locale.ROOT)", String.class ), 2331 Fields.REPLACE ); 2332 2333 Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) ); 2334 2335 Map<Object, Object> properties = getProperties(); 2336 2337 Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice ); 2338 2339 flow.complete(); 2340 2341 validateLength( flow, 5 ); 2342 2343 List<Tuple> values = getSinkAsList( flow ); 2344 2345 assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); 2346 assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); 2347 } 2348 2349 /** 2350 * This test checks for a deadlock when the same input is forked, adapted on one edge, then hashjoined back together. 2351 * 2352 * @throws Exception 2353 */ 2354 @Test 2355 public void testForkCoGroupThenHashJoin() throws Exception 2356 { 2357 getPlatform().copyFromLocal( inputFileLower ); 2358 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 2359 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 2360 2361 Map sources = new HashMap(); 2362 2363 sources.put( "sourceLower", sourceLower ); 2364 sources.put( "sourceUpper", sourceUpper ); 2365 2366 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "join" ), SinkMode.REPLACE ); 2367 2368 Function splitter = new RegexSplitter( new Fields( "num", "text" ), " " ); 2369 2370 Pipe leftPipeLower = new Each( new Pipe( "sourceLower" ), new Fields( "line" ), splitter ); 2371 Pipe rightPipeUpper = new Each( new Pipe( "sourceUpper" ), new Fields( "line" ), splitter ); 2372 2373 Pipe leftPipeUpper = new Each( new Pipe( "leftUpper", leftPipeLower ), new Fields( "text" ), 2374 new ExpressionFunction( Fields.ARGS, "text.toUpperCase(java.util.Locale.ROOT)", String.class ), 2375 Fields.REPLACE ); 2376 Pipe rightPipeLower = new Each( new Pipe( "rightLower", rightPipeUpper ), new Fields( "text" ), 2377 new ExpressionFunction( Fields.ARGS, "text.toLowerCase(java.util.Locale.ROOT)", String.class ), 2378 Fields.REPLACE ); 2379 2380 leftPipeUpper = new GroupBy( leftPipeUpper, new Fields( "num" ) ); 2381 rightPipeLower = new GroupBy( rightPipeLower, new Fields( "num" ) ); 2382 2383 Pipe middleSplice = new CoGroup( "middleCoGroup", leftPipeUpper, new Fields( "num" ), rightPipeLower, new Fields( "num" ), new Fields( "numM1", "charM1", "numM2", "charM2" ) ); 2384 2385 Pipe leftSplice = new HashJoin( leftPipeLower, new Fields( "num" ), middleSplice, new Fields( "numM1" ) ); 2386 2387 Map<Object, Object> properties = getProperties(); 2388 2389 Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, leftSplice ); 2390 2391 flow.complete(); 2392 2393 validateLength( flow, 5 ); 2394 2395 List<Tuple> values = getSinkAsList( flow ); 2396 // that the flow completes at all is already success. 2397 assertTrue( values.contains( new Tuple( "1\ta\t1\tA\t1\ta" ) ) ); 2398 assertTrue( values.contains( new Tuple( "2\tb\t2\tB\t2\tb" ) ) ); 2399 } 2400 2401 /** 2402 * This test checks for a deadlock when the same input is forked, adapted on one edge, cogroup with something, 2403 * then hashjoined back together. 2404 * 2405 * @throws Exception 2406 */ 2407 @Test 2408 public void testForkCoGroupThenHashJoinCoGroupAgain() throws Exception 2409 { 2410 getPlatform().copyFromLocal( inputFileLower ); 2411 getPlatform().copyFromLocal( inputFileUpper ); 2412 2413 Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); 2414 Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); 2415 2416 Map sources = new HashMap(); 2417 2418 sources.put( "sourceLower", sourceLower ); 2419 sources.put( "sourceUpper", sourceUpper ); 2420 2421 Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "join" ), SinkMode.REPLACE ); 2422 2423 Function splitter = new RegexSplitter( new Fields( "num", "text" ), " " ); 2424 2425 Pipe leftPipeLower = new Each( new Pipe( "sourceLower" ), new Fields( "line" ), splitter ); 2426 Pipe rightPipeUpper = new Each( new Pipe( "sourceUpper" ), new Fields( "line" ), splitter ); 2427 2428 Pipe leftPipeUpper = new Each( new Pipe( "leftUpper", leftPipeLower ), new Fields( "text" ), 2429 new ExpressionFunction( Fields.ARGS, "text.toUpperCase(java.util.Locale.ROOT)", String.class ), 2430 Fields.REPLACE ); 2431 Pipe rightPipeLower = new Each( new Pipe( "rightLower", rightPipeUpper ), new Fields( "text" ), 2432 new ExpressionFunction( Fields.ARGS, "text.toLowerCase(java.util.Locale.ROOT)", String.class ), 2433 Fields.REPLACE ); 2434 2435 leftPipeUpper = new GroupBy( leftPipeUpper, new Fields( "num" ) ); 2436 rightPipeLower = new GroupBy( rightPipeLower, new Fields( "num" ) ); 2437 2438 Pipe middleSplice = new CoGroup( "middleCoGroup", leftPipeUpper, new Fields( "num" ), rightPipeLower, new Fields( "num" ), new Fields( "numM1", "charM1", "numM2", "charM2" ) ); 2439 2440 Pipe leftSplice = new HashJoin( leftPipeLower, new Fields( "num" ), middleSplice, new Fields( "numM1" ) ); 2441 Pipe rightSplice = new HashJoin( rightPipeUpper, new Fields( "num" ), middleSplice, new Fields( "numM2" ) ); 2442 2443 leftSplice = new Rename( leftSplice, new Fields( "num", "text", "numM1", "charM1", "numM2", "charM2" ), new Fields( "numL1", "charL1", "numM1L", "charM1L", "numM2L", "charM2L" ) ); 2444 rightSplice = new Rename( rightSplice, new Fields( "num", "text", "numM1", "charM1", "numM2", "charM2" ), new Fields( "numR1", "charR1", "numM1R", "charM1R", "numM2R", "charM2R" ) ); 2445 2446 leftSplice = new GroupBy( leftSplice, new Fields( "numM1L" ) ); 2447 rightSplice = new GroupBy( rightSplice, new Fields( "numM2R" ) ); 2448 2449 Pipe splice = new CoGroup( "cogrouping", leftSplice, new Fields( "numM1L" ), rightSplice, new Fields( "numM2R" ) ); 2450 2451 Map<Object, Object> properties = getProperties(); 2452 2453 Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice ); 2454 2455 flow.complete(); 2456 2457 validateLength( flow, 5 ); 2458 2459 List<Tuple> values = getSinkAsList( flow ); 2460 2461 // getting this far is a success already (past old deadlocks) 2462 assertTrue( values.contains( new Tuple( "1\ta\t1\tA\t1\ta\t1\tA\t1\tA\t1\ta" ) ) ); 2463 assertTrue( values.contains( new Tuple( "2\tb\t2\tB\t2\tb\t2\tB\t2\tB\t2\tb" ) ) ); 2464 } 2465 }