001/*
002 * Copyright (c) 2007-2022 The Cascading Authors. All Rights Reserved.
003 *
004 * Project and contact information: https://cascading.wensel.net/
005 *
006 * This file is part of the Cascading project.
007 *
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *     http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 */
020
021package cascading;
022
023import java.io.Serializable;
024import java.util.ArrayList;
025import java.util.Collections;
026import java.util.Comparator;
027import java.util.HashMap;
028import java.util.HashSet;
029import java.util.List;
030import java.util.Map;
031import java.util.Set;
032
033import cascading.flow.Flow;
034import cascading.flow.FlowDef;
035import cascading.flow.FlowStep;
036import cascading.flow.planner.graph.ElementGraph;
037import cascading.operation.Aggregator;
038import cascading.operation.Function;
039import cascading.operation.Identity;
040import cascading.operation.aggregator.Count;
041import cascading.operation.aggregator.First;
042import cascading.operation.expression.ExpressionFunction;
043import cascading.operation.regex.RegexFilter;
044import cascading.operation.regex.RegexSplitter;
045import cascading.pipe.Checkpoint;
046import cascading.pipe.CoGroup;
047import cascading.pipe.Each;
048import cascading.pipe.Every;
049import cascading.pipe.GroupBy;
050import cascading.pipe.HashJoin;
051import cascading.pipe.Merge;
052import cascading.pipe.Pipe;
053import cascading.pipe.assembly.Rename;
054import cascading.pipe.joiner.InnerJoin;
055import cascading.pipe.joiner.Joiner;
056import cascading.pipe.joiner.LeftJoin;
057import cascading.pipe.joiner.MixedJoin;
058import cascading.pipe.joiner.OuterJoin;
059import cascading.pipe.joiner.RightJoin;
060import cascading.tap.SinkMode;
061import cascading.tap.Tap;
062import cascading.tuple.Fields;
063import cascading.tuple.Hasher;
064import cascading.tuple.Tuple;
065import org.junit.Test;
066
067import static data.InputData.*;
068
069public class JoinFieldedPipesPlatformTest extends PlatformTestCase
070  {
  /**
   * Creates the test case, forwarding its fixture settings to the parent test case constructor.
   * <p>
   * NOTE(review): per the inline comment the first argument keeps cluster testing enabled; the meaning of the
   * {@code 4} and {@code 1} arguments is not visible from this file -- confirm against the parent class.
   */
  public JoinFieldedPipesPlatformTest()
    {
    super( true, 4, 1 ); // leave cluster testing enabled
    }
075
076  @Test
077  public void testCross() throws Exception
078    {
079    getPlatform().copyFromLocal( inputFileLhs );
080    getPlatform().copyFromLocal( inputFileRhs );
081
082    Map sources = new HashMap();
083
084    sources.put( "lhs", getPlatform().getTextFile( inputFileLhs ) );
085    sources.put( "rhs", getPlatform().getTextFile( inputFileRhs ) );
086
087    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cross" ), SinkMode.REPLACE );
088
089    Pipe pipeLower = new Each( "lhs", new Fields( "line" ), new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " ) );
090    Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) );
091
092    Pipe cross = new HashJoin( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() );
093
094    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, cross );
095
096    flow.complete();
097
098    validateLength( flow, 37, null );
099
100    List<Tuple> values = getSinkAsList( flow );
101
102    assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
103    assertTrue( values.contains( new Tuple( "1\ta\t1\tB" ) ) );
104    }
105
106  @Test
107  public void testJoin() throws Exception
108    {
109    getPlatform().copyFromLocal( inputFileLower );
110    getPlatform().copyFromLocal( inputFileUpper );
111
112    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
113    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
114
115    Map sources = new HashMap();
116
117    sources.put( "lower", sourceLower );
118    sources.put( "upper", sourceUpper );
119
120    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "join" ), SinkMode.REPLACE );
121
122    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
123
124    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
125    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
126
127    Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );
128
129    Map<Object, Object> properties = getProperties();
130
131    Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice );
132
133    flow.complete();
134
135    validateLength( flow, 5 );
136
137    List<Tuple> values = getSinkAsList( flow );
138
139    assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
140    assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) );
141    }
142
143  @Test
144  public void testJoinSamePipeName() throws Exception
145    {
146    getPlatform().copyFromLocal( inputFileLower );
147    getPlatform().copyFromLocal( inputFileUpper );
148
149    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
150    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
151
152    Map sources = new HashMap();
153
154    sources.put( "lower", sourceLower );
155    sources.put( "upper", sourceUpper );
156
157    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "renamedpipes" ), SinkMode.REPLACE );
158
159    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
160
161    Pipe pipeLower = new Pipe( "lower" );
162    Pipe pipeUpper = new Pipe( "upper" );
163
164    // these pipes will hide the source name, and could cause one to be lost
165    pipeLower = new Pipe( "same", pipeLower );
166    pipeUpper = new Pipe( "same", pipeUpper );
167
168    pipeLower = new Each( pipeLower, new Fields( "line" ), splitter );
169    pipeUpper = new Each( pipeUpper, new Fields( "line" ), splitter );
170
171//    pipeLower = new Each( pipeLower, new Fields( "num", "char" ), new Identity( new Fields( "num", "char" ) ) );
172//    pipeUpper = new Each( pipeUpper, new Fields( "num", "char" ), new Identity( new Fields( "num", "char" ) ) );
173
174    pipeLower = new Pipe( "left", pipeLower );
175    pipeUpper = new Pipe( "right", pipeUpper );
176
177//    pipeLower = new Each( pipeLower, new Debug( true ) );
178//    pipeUpper = new Each( pipeUpper, new Debug( true ) );
179
180    Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );
181
182//    splice = new Each( splice, new Debug( true ) );
183    splice = new Pipe( "splice", splice );
184    splice = new Pipe( "tail", splice );
185
186    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
187
188    flow.complete();
189
190    validateLength( flow, 5 );
191
192    List<Tuple> values = getSinkAsList( flow );
193
194    assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
195    assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) );
196    }
197
198  @Test
199  public void testJoinWithUnknowns() throws Exception
200    {
201    getPlatform().copyFromLocal( inputFileLower );
202    getPlatform().copyFromLocal( inputFileUpper );
203
204    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
205    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
206
207    Map sources = new HashMap();
208
209    sources.put( "lower", sourceLower );
210    sources.put( "upper", sourceUpper );
211
212    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "unknown" ), SinkMode.REPLACE );
213
214    Function splitter = new RegexSplitter( Fields.UNKNOWN, " " );
215
216    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
217    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
218
219    Pipe splice = new HashJoin( pipeLower, new Fields( 0 ), pipeUpper, new Fields( 0 ), Fields.size( 4 ) );
220
221    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
222
223    flow.complete();
224
225    validateLength( flow, 5 );
226
227    List<Tuple> values = getSinkAsList( flow );
228
229    assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
230    assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) );
231    }
232
233  /**
234   * this test intentionally filters out all values so the intermediate tap is empty. this tap is cogrouped with
235   * a new stream using an outerjoin.
236   *
237   * @throws Exception
238   */
239  @Test
240  public void testJoinFilteredBranch() throws Exception
241    {
242    getPlatform().copyFromLocal( inputFileLower );
243    getPlatform().copyFromLocal( inputFileUpper );
244
245    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
246    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
247
248    Map sources = new HashMap();
249
250    sources.put( "lower", sourceLower );
251    sources.put( "upper", sourceUpper );
252
253    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinfilteredbranch" ), SinkMode.REPLACE );
254
255    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
256
257    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
258    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
259    pipeUpper = new Each( pipeUpper, new Fields( "num" ), new RegexFilter( "^fobar" ) ); // intentionally filtering all
260    pipeUpper = new GroupBy( pipeUpper, new Fields( "num" ) );
261
262    Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ), new OuterJoin() );
263
264    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
265
266    flow.complete();
267
268    validateLength( flow, 5 );
269
270    List<Tuple> values = getSinkAsList( flow );
271
272    assertTrue( values.contains( new Tuple( "1\ta\tnull\tnull" ) ) );
273    assertTrue( values.contains( new Tuple( "2\tb\tnull\tnull" ) ) );
274    }
275
276  @Test
277  public void testJoinSelf() throws Exception
278    {
279    getPlatform().copyFromLocal( inputFileLhs );
280
281    Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs );
282    Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs );
283
284    Map sources = new HashMap();
285
286    sources.put( "lhs", sourceLhs );
287    sources.put( "rhs", sourceRhs );
288
289    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinself" ), SinkMode.REPLACE );
290
291    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
292
293    Pipe pipeLower = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter );
294    Pipe pipeUpper = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter );
295
296    Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );
297
298    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
299
300    flow.complete();
301
302    validateLength( flow, 37 );
303
304    List<Tuple> values = getSinkAsList( flow );
305
306    assertTrue( values.contains( new Tuple( "1\ta\t1\ta" ) ) );
307    assertTrue( values.contains( new Tuple( "2\tb\t2\tb" ) ) );
308    }
309
310  @Test
311  public void testSameSourceJoin() throws Exception
312    {
313    getPlatform().copyFromLocal( inputFileLhs );
314
315    Tap source = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLhs );
316
317    Map sources = new HashMap();
318
319    sources.put( "lhs", source );
320    sources.put( "rhs", source );
321
322    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath(), SinkMode.REPLACE );
323
324    Pipe pipeLower = new Pipe( "lhs" );
325    Pipe pipeUpper = new Pipe( "rhs" );
326
327    Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );
328
329    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
330
331    flow.complete();
332
333    validateLength( flow, 37 );
334
335    List<Tuple> values = getSinkAsList( flow );
336
337    assertTrue( values.contains( new Tuple( "1\ta\t1\ta" ) ) );
338    assertTrue( values.contains( new Tuple( "2\tb\t2\tb" ) ) );
339    }
340
341  /**
342   * Method testCoGroupAfterEvery tests that a tmp tap is inserted after the Every in the cogroup join
343   *
344   * @throws Exception when
345   */
346  @Test
347  public void testJoinAfterEvery() throws Exception
348    {
349    getPlatform().copyFromLocal( inputFileLower );
350    getPlatform().copyFromLocal( inputFileUpper );
351
352    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
353    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
354
355    Map sources = new HashMap();
356
357    sources.put( "lower", sourceLower );
358    sources.put( "upper", sourceUpper );
359
360    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "afterevery" ), SinkMode.REPLACE );
361
362    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
363
364    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
365    pipeLower = new GroupBy( pipeLower, new Fields( "num" ) );
366    pipeLower = new Every( pipeLower, new Fields( "char" ), new First(), Fields.ALL );
367
368    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
369    pipeUpper = new GroupBy( pipeUpper, new Fields( "num" ) );
370    pipeUpper = new Every( pipeUpper, new Fields( "char" ), new First(), Fields.ALL );
371
372    Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );
373
374    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
375
376    flow.complete();
377
378    validateLength( flow, 5, null );
379
380    List<Tuple> values = getSinkAsList( flow );
381
382    assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
383    assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) );
384    }
385
386  @Test
387  public void testJoinInnerSingleField() throws Exception
388    {
389    getPlatform().copyFromLocal( inputFileLowerOffset );
390    getPlatform().copyFromLocal( inputFileUpper );
391
392    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLowerOffset );
393    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
394
395    Map sources = new HashMap();
396
397    sources.put( "lower", sourceLower );
398    sources.put( "upper", sourceUpper );
399
400    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joininnersingle" ), SinkMode.REPLACE );
401
402    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char" ), " " ), new Fields( "num1" ) );
403    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), new RegexSplitter( new Fields( "num2", "char" ), " " ), new Fields( "num2" ) );
404
405    Pipe join = new HashJoin( pipeLower, new Fields( "num1" ), pipeUpper, new Fields( "num2" ) );
406
407    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, join );
408
409    flow.complete();
410
411    validateLength( flow, 3, null );
412
413    Set<Tuple> results = new HashSet<Tuple>();
414
415    results.add( new Tuple( "1\t1" ) );
416    results.add( new Tuple( "5\t5" ) );
417
418    List<Tuple> actual = getSinkAsList( flow );
419
420    results.removeAll( actual );
421
422    assertEquals( 0, results.size() );
423    }
424
425  /**
426   * 1 a1
427   * 1 a2
428   * 1 a3
429   * 2 b1
430   * 3 c1
431   * 4 d1
432   * 4 d2
433   * 4 d3
434   * 5 e1
435   * 5 e2
436   * 5 e3
437   * 7 g1
438   * 7 g2
439   * 7 g3
440   * 7 g4
441   * 7 g5
442   * null h1
443   * <p>
444   * 1 A1
445   * 1 A2
446   * 1 A3
447   * 2 B1
448   * 2 B2
449   * 2 B3
450   * 4 D1
451   * 6 F1
452   * 6 F2
453   * null H1
454   * <p>
455   * 1  a1      1       A1
456   * 1  a1      1       A2
457   * 1  a1      1       A3
458   * 1  a2      1       A1
459   * 1  a2      1       A2
460   * 1  a2      1       A3
461   * 1  a3      1       A1
462   * 1  a3      1       A2
463   * 1  a3      1       A3
464   * 2  b1      2       B1
465   * 2  b1      2       B2
466   * 2  b1      2       B3
467   * 4  d1      4       D1
468   * 4  d2      4       D1
469   * 4  d3      4       D1
470   * null h1  null  H1
471   *
472   * @throws Exception
473   */
474  @Test
475  public void testJoinInner() throws Exception
476    {
477    HashSet<Tuple> results = new HashSet<Tuple>();
478
479    results.add( new Tuple( "1", "a1", "1", "A1" ) );
480    results.add( new Tuple( "1", "a1", "1", "A2" ) );
481    results.add( new Tuple( "1", "a1", "1", "A3" ) );
482    results.add( new Tuple( "1", "a2", "1", "A1" ) );
483    results.add( new Tuple( "1", "a2", "1", "A2" ) );
484    results.add( new Tuple( "1", "a2", "1", "A3" ) );
485    results.add( new Tuple( "1", "a3", "1", "A1" ) );
486    results.add( new Tuple( "1", "a3", "1", "A2" ) );
487    results.add( new Tuple( "1", "a3", "1", "A3" ) );
488    results.add( new Tuple( "2", "b1", "2", "B1" ) );
489    results.add( new Tuple( "2", "b1", "2", "B2" ) );
490    results.add( new Tuple( "2", "b1", "2", "B3" ) );
491    results.add( new Tuple( "4", "d1", "4", "D1" ) );
492    results.add( new Tuple( "4", "d2", "4", "D1" ) );
493    results.add( new Tuple( "4", "d3", "4", "D1" ) );
494    results.add( new Tuple( null, "h1", null, "H1" ) );
495
496    handleJoins( "joininner", new InnerJoin(), results );
497    }
498
499  /**
500   * /**
501   * 1 a1
502   * 1 a2
503   * 1 a3
504   * 2 b1
505   * 3 c1
506   * 4 d1
507   * 4 d2
508   * 4 d3
509   * 5 e1
510   * 5 e2
511   * 5 e3
512   * 7 g1
513   * 7 g2
514   * 7 g3
515   * 7 g4
516   * 7 g5
517   * null h1
518   * <p>
519   * 1 A1
520   * 1 A2
521   * 1 A3
522   * 2 B1
523   * 2 B2
524   * 2 B3
525   * 4 D1
526   * 6 F1
527   * 6 F2
528   * null H1
529   * <p>
530   * 1  a1      1       A1
531   * 1  a1      1       A2
532   * 1  a1      1       A3
533   * 1  a2      1       A1
534   * 1  a2      1       A2
535   * 1  a2      1       A3
536   * 1  a3      1       A1
537   * 1  a3      1       A2
538   * 1  a3      1       A3
539   * 2  b1      2       B1
540   * 2  b1      2       B2
541   * 2  b1      2       B3
542   * 3  c1      null    null
543   * 4  d1      4       D1
544   * 4  d2      4       D1
545   * 4  d3      4       D1
546   * 5  e1      null    null
547   * 5  e2      null    null
548   * 5  e3      null    null
549   * null       null    6       F1
550   * null       null    6       F2
551   * 7  g1      null    null
552   * 7  g2      null    null
553   * 7  g3      null    null
554   * 7  g4      null    null
555   * 7  g5      null    null
556   * null h1  null  H1
557   *
558   * @throws Exception
559   */
560  @Test
561  public void testJoinOuter() throws Exception
562    {
563    // skip if hadoop cluster mode, outer joins don't behave the same
564    if( getPlatform().isMapReduce() && getPlatform().isUseCluster() )
565      return;
566
567    Set<Tuple> results = new HashSet<Tuple>();
568
569    results.add( new Tuple( "1", "a1", "1", "A1" ) );
570    results.add( new Tuple( "1", "a1", "1", "A2" ) );
571    results.add( new Tuple( "1", "a1", "1", "A3" ) );
572    results.add( new Tuple( "1", "a2", "1", "A1" ) );
573    results.add( new Tuple( "1", "a2", "1", "A2" ) );
574    results.add( new Tuple( "1", "a2", "1", "A3" ) );
575    results.add( new Tuple( "1", "a3", "1", "A1" ) );
576    results.add( new Tuple( "1", "a3", "1", "A2" ) );
577    results.add( new Tuple( "1", "a3", "1", "A3" ) );
578    results.add( new Tuple( "2", "b1", "2", "B1" ) );
579    results.add( new Tuple( "2", "b1", "2", "B2" ) );
580    results.add( new Tuple( "2", "b1", "2", "B3" ) );
581    results.add( new Tuple( "3", "c1", null, null ) );
582    results.add( new Tuple( "4", "d1", "4", "D1" ) );
583    results.add( new Tuple( "4", "d2", "4", "D1" ) );
584    results.add( new Tuple( "4", "d3", "4", "D1" ) );
585    results.add( new Tuple( "5", "e1", null, null ) );
586    results.add( new Tuple( "5", "e2", null, null ) );
587    results.add( new Tuple( "5", "e3", null, null ) );
588    results.add( new Tuple( null, null, "6", "F1" ) );
589    results.add( new Tuple( null, null, "6", "F2" ) );
590    results.add( new Tuple( "7", "g1", null, null ) );
591    results.add( new Tuple( "7", "g2", null, null ) );
592    results.add( new Tuple( "7", "g3", null, null ) );
593    results.add( new Tuple( "7", "g4", null, null ) );
594    results.add( new Tuple( "7", "g5", null, null ) );
595    results.add( new Tuple( null, "h1", null, "H1" ) );
596
597    handleJoins( "joinouter", new OuterJoin(), results );
598    }
599
600  /**
601   * 1 a1
602   * 1 a2
603   * 1 a3
604   * 2 b1
605   * 3 c1
606   * 4 d1
607   * 4 d2
608   * 4 d3
609   * 5 e1
610   * 5 e2
611   * 5 e3
612   * 7 g1
613   * 7 g2
614   * 7 g3
615   * 7 g4
616   * 7 g5
617   * null h1
618   * <p>
619   * 1 A1
620   * 1 A2
621   * 1 A3
622   * 2 B1
623   * 2 B2
624   * 2 B3
625   * 4 D1
626   * 6 F1
627   * 6 F2
628   * null H1
629   * <p>
630   * 1  a1      1       A1
631   * 1  a1      1       A2
632   * 1  a1      1       A3
633   * 1  a2      1       A1
634   * 1  a2      1       A2
635   * 1  a2      1       A3
636   * 1  a3      1       A1
637   * 1  a3      1       A2
638   * 1  a3      1       A3
639   * 2  b1      2       B1
640   * 2  b1      2       B2
641   * 2  b1      2       B3
642   * 3  c1      null    null
643   * 4  d1      4       D1
644   * 4  d2      4       D1
645   * 4  d3      4       D1
646   * 5  e1      null    null
647   * 5  e2      null    null
648   * 5  e3      null    null
649   * 7  g1      null    null
650   * 7  g2      null    null
651   * 7  g3      null    null
652   * 7  g4      null    null
653   * 7  g5      null    null
654   * null h1    null    H1
655   *
656   * @throws Exception
657   */
658  @Test
659  public void testJoinInnerOuter() throws Exception
660    {
661    Set<Tuple> results = new HashSet<Tuple>();
662
663    results.add( new Tuple( "1", "a1", "1", "A1" ) );
664    results.add( new Tuple( "1", "a1", "1", "A2" ) );
665    results.add( new Tuple( "1", "a1", "1", "A3" ) );
666    results.add( new Tuple( "1", "a2", "1", "A1" ) );
667    results.add( new Tuple( "1", "a2", "1", "A2" ) );
668    results.add( new Tuple( "1", "a2", "1", "A3" ) );
669    results.add( new Tuple( "1", "a3", "1", "A1" ) );
670    results.add( new Tuple( "1", "a3", "1", "A2" ) );
671    results.add( new Tuple( "1", "a3", "1", "A3" ) );
672    results.add( new Tuple( "2", "b1", "2", "B1" ) );
673    results.add( new Tuple( "2", "b1", "2", "B2" ) );
674    results.add( new Tuple( "2", "b1", "2", "B3" ) );
675    results.add( new Tuple( "3", "c1", null, null ) );
676    results.add( new Tuple( "4", "d1", "4", "D1" ) );
677    results.add( new Tuple( "4", "d2", "4", "D1" ) );
678    results.add( new Tuple( "4", "d3", "4", "D1" ) );
679    results.add( new Tuple( "5", "e1", null, null ) );
680    results.add( new Tuple( "5", "e2", null, null ) );
681    results.add( new Tuple( "5", "e3", null, null ) );
682    results.add( new Tuple( "7", "g1", null, null ) );
683    results.add( new Tuple( "7", "g2", null, null ) );
684    results.add( new Tuple( "7", "g3", null, null ) );
685    results.add( new Tuple( "7", "g4", null, null ) );
686    results.add( new Tuple( "7", "g5", null, null ) );
687    results.add( new Tuple( null, "h1", null, "H1" ) );
688
689    handleJoins( "joininnerouter", new LeftJoin(), results );
690    }
691
692  /**
693   * 1 a1
694   * 1 a2
695   * 1 a3
696   * 2 b1
697   * 3 c1
698   * 4 d1
699   * 4 d2
700   * 4 d3
701   * 5 e1
702   * 5 e2
703   * 5 e3
704   * 7 g1
705   * 7 g2
706   * 7 g3
707   * 7 g4
708   * 7 g5
709   * null h1
710   * <p>
711   * 1 A1
712   * 1 A2
713   * 1 A3
714   * 2 B1
715   * 2 B2
716   * 2 B3
717   * 4 D1
718   * 6 F1
719   * 6 F2
720   * null H1
721   * <p>
722   * 1  a1      1       A1
723   * 1  a1      1       A2
724   * 1  a1      1       A3
725   * 1  a2      1       A1
726   * 1  a2      1       A2
727   * 1  a2      1       A3
728   * 1  a3      1       A1
729   * 1  a3      1       A2
730   * 1  a3      1       A3
731   * 2  b1      2       B1
732   * 2  b1      2       B2
733   * 2  b1      2       B3
734   * 4  d1      4       D1
735   * 4  d2      4       D1
736   * 4  d3      4       D1
737   * null       null    6       F1
738   * null       null    6       F2
739   * null h1    null    H1
740   *
741   * @throws Exception
742   */
743  @Test
744  public void testJoinOuterInner() throws Exception
745    {
746    // skip if hadoop cluster mode, outer joins don't behave the same
747    if( getPlatform().isMapReduce() && getPlatform().isUseCluster() )
748      return;
749
750    Set<Tuple> results = new HashSet<Tuple>();
751
752    results.add( new Tuple( "1", "a1", "1", "A1" ) );
753    results.add( new Tuple( "1", "a1", "1", "A2" ) );
754    results.add( new Tuple( "1", "a1", "1", "A3" ) );
755    results.add( new Tuple( "1", "a2", "1", "A1" ) );
756    results.add( new Tuple( "1", "a2", "1", "A2" ) );
757    results.add( new Tuple( "1", "a2", "1", "A3" ) );
758    results.add( new Tuple( "1", "a3", "1", "A1" ) );
759    results.add( new Tuple( "1", "a3", "1", "A2" ) );
760    results.add( new Tuple( "1", "a3", "1", "A3" ) );
761    results.add( new Tuple( "2", "b1", "2", "B1" ) );
762    results.add( new Tuple( "2", "b1", "2", "B2" ) );
763    results.add( new Tuple( "2", "b1", "2", "B3" ) );
764    results.add( new Tuple( "4", "d1", "4", "D1" ) );
765    results.add( new Tuple( "4", "d2", "4", "D1" ) );
766    results.add( new Tuple( "4", "d3", "4", "D1" ) );
767    results.add( new Tuple( null, null, "6", "F1" ) );
768    results.add( new Tuple( null, null, "6", "F2" ) );
769    results.add( new Tuple( null, "h1", null, "H1" ) );
770
771    handleJoins( "joinouterinner", new RightJoin(), results );
772    }
773
774  private void handleJoins( String path, Joiner joiner, Set<Tuple> results ) throws Exception
775    {
776    getPlatform().copyFromLocal( inputFileLhsSparse );
777    getPlatform().copyFromLocal( inputFileRhsSparse );
778
779    Fields fields = new Fields( "num", "char" ).applyTypes( Integer.class, String.class );
780    Tap sourceLower = getPlatform().getDelimitedFile( fields, " ", inputFileLhsSparse );
781    Tap sourceUpper = getPlatform().getDelimitedFile( fields, " ", inputFileRhsSparse );
782
783    Map sources = new HashMap();
784
785    sources.put( "lower", sourceLower );
786    sources.put( "upper", sourceUpper );
787
788    Tap sink = getPlatform().getDelimitedFile( Fields.size( 4, String.class ), "\t", getOutputPath( path ), SinkMode.REPLACE );
789
790    Pipe pipeLower = new Pipe( "lower" );
791    Pipe pipeUpper = new Pipe( "upper" );
792
793    Fields declaredFields = new Fields( "num", "char", "num2", "char2" );
794    Fields groupingFields = new Fields( "num" );
795
796    Pipe splice = new HashJoin( pipeLower, groupingFields, pipeUpper, groupingFields, declaredFields, joiner );
797
798    splice = new Each( splice, Fields.ALL, new Identity(), Fields.RESULTS );
799
800    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
801
802    flow.complete();
803
804    validateLength( flow, results.size() );
805
806    List<Tuple> actual = getSinkAsList( flow );
807
808    results.removeAll( actual );
809
810    assertEquals( 0, results.size() );
811    }
812
813  /**
814   * 1 a
815   * 5 b
816   * 6 c
817   * 5 b
818   * 5 e
819   * <p>
820   * 1 A
821   * 2 B
822   * 3 C
823   * 4 D
824   * 5 E
825   * <p>
826   * 1 a
827   * 2 b
828   * 3 c
829   * 4 d
830   * 5 e
831   * <p>
832   * 1  a       1       A  1  a
833   * -  -   2   B  2  b
834   * -  -   3   C  3  c
835   * -  -   4   D  4  d
836   * 5  b       5   E  5  e
837   * 5  e       5   E  5  e
838   *
839   * @throws Exception
840   */
841  @Test
842  public void testJoinMixed() throws Exception
843    {
844    // skip if hadoop cluster mode, outer joins don't behave the same
845    if( getPlatform().isMapReduce() && getPlatform().isUseCluster() )
846      return;
847
848    getPlatform().copyFromLocal( inputFileLowerOffset );
849    getPlatform().copyFromLocal( inputFileLower );
850    getPlatform().copyFromLocal( inputFileUpper );
851
852    Tap sourceLowerOffset = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLowerOffset );
853    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
854    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
855
856    Map sources = new HashMap();
857
858    sources.put( "loweroffset", sourceLowerOffset );
859    sources.put( "lower", sourceLower );
860    sources.put( "upper", sourceUpper );
861
862    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinmixed" ), SinkMode.REPLACE );
863
864    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
865
866    Pipe pipeLowerOffset = new Each( new Pipe( "loweroffset" ), new Fields( "line" ), splitter );
867    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
868    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
869
870    Pipe[] pipes = Pipe.pipes( pipeLowerOffset, pipeUpper, pipeLower );
871    Fields[] fields = Fields.fields( new Fields( "num" ), new Fields( "num" ), new Fields( "num" ) );
872
873    MixedJoin join = new MixedJoin( new boolean[]{MixedJoin.OUTER, MixedJoin.INNER, MixedJoin.OUTER} );
874    Pipe splice = new HashJoin( pipes, fields, Fields.size( 6 ), join );
875
876    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice );
877
878    flow.complete();
879
880    validateLength( flow, 6 );
881
882    Set<Tuple> results = new HashSet<Tuple>();
883
884    results.add( new Tuple( "1\ta\t1\tA\t1\ta" ) );
885    results.add( new Tuple( "null\tnull\t2\tB\t2\tb" ) );
886    results.add( new Tuple( "null\tnull\t3\tC\t3\tc" ) );
887    results.add( new Tuple( "null\tnull\t4\tD\t4\td" ) );
888    results.add( new Tuple( "5\tb\t5\tE\t5\te" ) );
889    results.add( new Tuple( "5\te\t5\tE\t5\te" ) );
890
891    List<Tuple> actual = getSinkAsList( flow );
892
893    results.removeAll( actual );
894
895    assertEquals( 0, results.size() );
896    }
897
898  @Test
899  public void testJoinDiffFields() throws Exception
900    {
901    getPlatform().copyFromLocal( inputFileLower );
902    getPlatform().copyFromLocal( inputFileUpper );
903
904    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
905    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
906
907    Map sources = new HashMap();
908
909    sources.put( "lower", sourceLower );
910    sources.put( "upper", sourceUpper );
911
912    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "difffields" ), SinkMode.REPLACE );
913
914    Function splitterLower = new RegexSplitter( new Fields( "numA", "lower" ), " " );
915    Function splitterUpper = new RegexSplitter( new Fields( "numB", "upper" ), " " );
916
917    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitterLower );
918    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitterUpper );
919
920    Pipe pipe = new HashJoin( pipeLower, new Fields( "numA" ), pipeUpper, new Fields( "numB" ) );
921
922    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, pipe );
923
924    flow.complete();
925
926    validateLength( flow, 5 );
927
928    List<Tuple> actual = getSinkAsList( flow );
929
930    assertTrue( actual.contains( new Tuple( "1\ta\t1\tA" ) ) );
931    assertTrue( actual.contains( new Tuple( "2\tb\t2\tB" ) ) );
932    }
933
934  @Test
935  public void testJoinGroupBy() throws Exception
936    {
937    getPlatform().copyFromLocal( inputFileLower );
938    getPlatform().copyFromLocal( inputFileUpper );
939
940    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
941    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
942
943    Map sources = new HashMap();
944
945    sources.put( "lower", sourceLower );
946    sources.put( "upper", sourceUpper );
947
948    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joingroupby" ), SinkMode.REPLACE );
949
950    Function splitterLower = new RegexSplitter( new Fields( "numA", "lower" ), " " );
951    Function splitterUpper = new RegexSplitter( new Fields( "numB", "upper" ), " " );
952
953    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitterLower );
954    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitterUpper );
955
956    Pipe pipe = new HashJoin( pipeLower, new Fields( "numA" ), pipeUpper, new Fields( "numB" ) );
957
958    Pipe groupby = new GroupBy( pipe, new Fields( "numA" ) );
959
960    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, groupby );
961
962    flow.complete();
963
964    validateLength( flow, 5, null );
965
966    List<Tuple> actual = getSinkAsList( flow );
967
968    assertTrue( actual.contains( new Tuple( "1\ta\t1\tA" ) ) );
969    assertTrue( actual.contains( new Tuple( "2\tb\t2\tB" ) ) );
970    }
971
972  @Test
973  public void testJoinSamePipe() throws Exception
974    {
975    getPlatform().copyFromLocal( inputFileLower );
976
977    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
978
979    Map sources = new HashMap();
980
981    sources.put( "lower", source );
982
983    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipe" ), SinkMode.REPLACE );
984
985    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
986
987    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
988
989    Pipe pipe = new HashJoin( pipeLower, new Fields( "num" ), 1, new Fields( "num1", "char1", "num2", "char2" ) );
990
991    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, pipe );
992
993    flow.complete();
994
995    validateLength( flow, 5, null );
996
997    List<Tuple> actual = getSinkAsList( flow );
998
999    assertTrue( actual.contains( new Tuple( "1\ta\t1\ta" ) ) );
1000    assertTrue( actual.contains( new Tuple( "2\tb\t2\tb" ) ) );
1001    }
1002
1003  @Test
1004  public void testJoinSamePipe2() throws Exception
1005    {
1006    getPlatform().copyFromLocal( inputFileLower );
1007
1008    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1009
1010    Map sources = new HashMap();
1011
1012    sources.put( "lower", source );
1013
1014    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipe2" ), SinkMode.REPLACE );
1015
1016    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1017
1018    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1019
1020    Pipe join = new HashJoin( pipeLower, new Fields( "num" ), pipeLower, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1021
1022    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, join );
1023
1024    flow.complete();
1025
1026    validateLength( flow, 5, null );
1027
1028    List<Tuple> actual = getSinkAsList( flow );
1029
1030    assertTrue( actual.contains( new Tuple( "1\ta\t1\ta" ) ) );
1031    assertTrue( actual.contains( new Tuple( "2\tb\t2\tb" ) ) );
1032    }
1033
1034  @Test
1035  public void testJoinSamePipe3() throws Exception
1036    {
1037    getPlatform().copyFromLocal( inputFileLower );
1038
1039    Tap source = getPlatform().getDelimitedFile( new Fields( "num", "char" ), " ", inputFileLower );
1040
1041    Map sources = new HashMap();
1042
1043    sources.put( "lower", source );
1044
1045    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipe3" ), SinkMode.REPLACE );
1046
1047    Pipe pipe = new Pipe( "lower" );
1048
1049    Pipe lhs = new Pipe( "lhs", pipe );
1050    Pipe rhs = new Pipe( "rhs", pipe );
1051
1052    Pipe join = new HashJoin( lhs, new Fields( "num" ), rhs, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1053
1054    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, join );
1055
1056    flow.complete();
1057
1058    validateLength( flow, 5, null );
1059
1060    List<Tuple> actual = getSinkAsList( flow );
1061
1062    assertTrue( actual.contains( new Tuple( "1\ta\t1\ta" ) ) );
1063    assertTrue( actual.contains( new Tuple( "2\tb\t2\tb" ) ) );
1064    }
1065
1066  /**
1067   * Same source as rightmost
1068   * <p>
1069   * should be a single job as the same file accumulates into the joins
1070   *
1071   * @throws Exception
1072   */
1073  @Test
1074  public void testJoinAroundJoinRightMost() throws Exception
1075    {
1076    getPlatform().copyFromLocal( inputFileLower );
1077    getPlatform().copyFromLocal( inputFileUpper );
1078
1079    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1080    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1081
1082    Map sources = new HashMap();
1083
1084    sources.put( "lower", sourceLower );
1085    sources.put( "upper1", sourceUpper );
1086    sources.put( "upper2", sourceUpper );
1087
1088    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinaroundjoinrightmost" ), SinkMode.REPLACE );
1089
1090    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1091
1092    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1093    Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter );
1094    Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter );
1095
1096    Pipe splice1 = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper1, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1097
1098    splice1 = new Each( splice1, new Identity() );
1099
1100    Pipe splice2 = new HashJoin( splice1, new Fields( "num1" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) );
1101
1102    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 );
1103
1104//    flow.writeDOT( "joinaroundrightmost.dot" );
1105
1106    if( getPlatform().isMapReduce() )
1107      assertEquals( "wrong number of steps", 1, flow.getFlowSteps().size() );
1108
1109    flow.complete();
1110
1111    validateLength( flow, 5, null );
1112
1113    List<Tuple> actual = getSinkAsList( flow );
1114
1115    assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\tA" ) ) );
1116    assertTrue( actual.contains( new Tuple( "2\tb\t2\tB\t2\tB" ) ) );
1117    }
1118
1119  /**
1120   * Same source as leftmost
1121   *
1122   * @throws Exception
1123   */
1124  @Test
1125  public void testJoinAroundJoinLeftMost() throws Exception
1126    {
1127    getPlatform().copyFromLocal( inputFileLower );
1128    getPlatform().copyFromLocal( inputFileUpper );
1129
1130    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1131    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1132
1133    Map sources = new HashMap();
1134
1135    sources.put( "lower", sourceLower );
1136    sources.put( "upper1", sourceUpper );
1137    sources.put( "upper2", sourceUpper );
1138
1139    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinaroundjoinleftmost" ), SinkMode.REPLACE );
1140
1141    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1142
1143    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1144    Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter );
1145    Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter );
1146
1147    Pipe splice1 = new HashJoin( pipeUpper1, new Fields( "num" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1148
1149    splice1 = new Each( splice1, new Identity() );
1150
1151    Pipe splice2 = new HashJoin( splice1, new Fields( "num1" ), pipeLower, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) );
1152
1153    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 );
1154
1155//    flow.writeDOT( "joinaroundleftmost.dot" );
1156
1157    if( getPlatform().isMapReduce() )
1158      assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() );
1159
1160    flow.complete();
1161
1162    validateLength( flow, 5, null );
1163
1164    List<Tuple> actual = getSinkAsList( flow );
1165
1166    assertTrue( actual.contains( new Tuple( "1\tA\t1\tA\t1\ta" ) ) );
1167    assertTrue( actual.contains( new Tuple( "2\tB\t2\tB\t2\tb" ) ) );
1168    }
1169
1170  /**
1171   * Upper as leftmost and rightmost forcing two jobs
1172   *
1173   * @throws Exception
1174   */
1175  @Test
1176  public void testJoinAroundJoinRightMostSwapped() throws Exception
1177    {
1178    getPlatform().copyFromLocal( inputFileLower );
1179    getPlatform().copyFromLocal( inputFileUpper );
1180
1181    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1182    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1183
1184    Map sources = new HashMap();
1185
1186    sources.put( "lower", sourceLower );
1187    sources.put( "upper1", sourceUpper );
1188    sources.put( "upper2", sourceUpper );
1189
1190    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinaroundjoinswapped" ), SinkMode.REPLACE );
1191
1192    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1193
1194    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1195    Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter );
1196    Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter );
1197
1198    Pipe splice1 = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper1, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1199
1200    splice1 = new Each( splice1, new Identity() );
1201
1202    // upper2 becomes leftmost, forcing a tap between the joins
1203    Pipe splice2 = new HashJoin( pipeUpper2, new Fields( "num" ), splice1, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) );
1204
1205    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 );
1206
1207    if( getPlatform().isMapReduce() )
1208      assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() );
1209
1210    flow.complete();
1211
1212    validateLength( flow, 5, null );
1213
1214    List<Tuple> actual = getSinkAsList( flow );
1215
1216    assertTrue( actual.contains( new Tuple( "1\tA\t1\ta\t1\tA" ) ) );
1217    assertTrue( actual.contains( new Tuple( "2\tB\t2\tb\t2\tB" ) ) );
1218    }
1219
1220  @Test
1221  public void testJoinGroupByJoin() throws Exception
1222    {
1223    getPlatform().copyFromLocal( inputFileLower );
1224    getPlatform().copyFromLocal( inputFileUpper );
1225    getPlatform().copyFromLocal( inputFileJoined );
1226
1227    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1228    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1229    Tap sourceJoined = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileJoined );
1230
1231    Map sources = new HashMap();
1232
1233    sources.put( "lower", sourceLower );
1234    sources.put( "upper", sourceUpper );
1235    sources.put( "joined", sourceJoined );
1236
1237    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joingroupbyjoin" ), SinkMode.REPLACE );
1238
1239    Function splitterLower = new RegexSplitter( new Fields( "numA", "lower" ), " " );
1240    Function splitterUpper = new RegexSplitter( new Fields( "numB", "upper" ), " " );
1241    Function splitterJoined = new RegexSplitter( new Fields( "numC", "lowerC", "upperC" ), "\t" );
1242
1243    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitterLower );
1244    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitterUpper );
1245    Pipe pipeJoined = new Each( new Pipe( "joined" ), new Fields( "line" ), splitterJoined );
1246
1247    Pipe pipe = new HashJoin( pipeLower, new Fields( "numA" ), pipeUpper, new Fields( "numB" ) );
1248
1249    pipe = new GroupBy( pipe, new Fields( "numA" ) );
1250
1251    pipe = new HashJoin( pipe, new Fields( "numA" ), pipeJoined, new Fields( "numC" ) );
1252
1253    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, pipe );
1254
1255    if( getPlatform().isMapReduce() )
1256      assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() );
1257
1258    flow.complete();
1259
1260    validateLength( flow, 5, null );
1261
1262    List<Tuple> actual = getSinkAsList( flow );
1263
1264    assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\ta\tA" ) ) );
1265    assertTrue( actual.contains( new Tuple( "2\tb\t2\tB\t2\tb\tB" ) ) );
1266    }
1267
1268  /**
1269   * here the same file is fed into the same HashJoin.
1270   * <p>
1271   * This is three jobs.
1272   * <p>
1273   * a temp tap is inserted before the accumulated branch for two reasons on the common HashJoin
1274   * <p>
1275   * it is assumed the accumulated side is filtered down, so pushing to disk will preserve io
1276   * if accumulated side was streamed instead via a fork, only part of the file will accumulate into the HashJoin
1277   * <p>
1278   * /-T-\ <-- accumulated
1279   * T      HJ
1280   * \---/ <-- streamed
1281   *
1282   * @throws Exception
1283   */
1284  @Test
1285  public void testJoinSameSourceIntoJoin() throws Exception
1286    {
1287    getPlatform().copyFromLocal( inputFileLower );
1288    getPlatform().copyFromLocal( inputFileUpper );
1289
1290    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1291    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1292
1293    Map sources = new HashMap();
1294
1295    sources.put( "lower", sourceLower );
1296    sources.put( "upper1", sourceUpper );
1297    sources.put( "upper2", sourceUpper );
1298
1299    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsamesourceintojoin" ), SinkMode.REPLACE );
1300
1301    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1302
1303    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1304    Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter );
1305    Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter );
1306
1307    Pipe splice1 = new HashJoin( pipeUpper1, new Fields( "num" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1308
1309    splice1 = new Each( splice1, new Identity() );
1310
1311    Pipe splice2 = new HashJoin( pipeLower, new Fields( "num" ), splice1, new Fields( "num1" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) );
1312
1313    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 );
1314
1315//    flow.writeDOT( "joinsamesourceintojoin.dot" );
1316
1317    if( getPlatform().isMapReduce() )
1318      assertEquals( "wrong number of steps", 3, flow.getFlowSteps().size() );
1319
1320    flow.complete();
1321
1322    validateLength( flow, 5, null );
1323
1324    List<Tuple> actual = getSinkAsList( flow );
1325
1326    assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\tA" ) ) );
1327    assertTrue( actual.contains( new Tuple( "2\tb\t2\tB\t2\tB" ) ) );
1328    }
1329
1330  @Test
1331  public void testJoinSameSourceIntoJoinSimple() throws Exception
1332    {
1333    getPlatform().copyFromLocal( inputFileUpper );
1334
1335    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1336
1337    Map sources = new HashMap();
1338
1339    sources.put( "upper1", sourceUpper );
1340    sources.put( "upper2", sourceUpper );
1341
1342    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsamesourceintojoinsimple" ), SinkMode.REPLACE );
1343
1344    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1345
1346    Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter );
1347    Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter );
1348
1349    Pipe splice1 = new HashJoin( pipeUpper1, new Fields( "num" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1350
1351    splice1 = new Each( splice1, new Identity() );
1352
1353    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice1 );
1354
1355//    flow.writeDOT( "joinsamesourceintojoin.dot" );
1356
1357    if( getPlatform().isMapReduce() )
1358      assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() );
1359
1360    flow.complete();
1361
1362    validateLength( flow, 5, null );
1363
1364    List<Tuple> actual = getSinkAsList( flow );
1365
1366    assertTrue( actual.contains( new Tuple( "1\tA\t1\tA" ) ) );
1367    assertTrue( actual.contains( new Tuple( "2\tB\t2\tB" ) ) );
1368    }
1369
1370  /**
1371   * Loosely tests for a deadlock when BlockingHashJoinAnnotator rule doesn't excluce the GroupBy from the blocking
1372   * annotation.
1373   * <p>
1374   * the deadlock is random on the order of the paths traversed from the Source Tap + fork.
1375   *
1376   * @throws Exception
1377   */
1378  @Test
1379  public void testJoinSameSourceOverGroupByIntoJoinSimple() throws Exception
1380    {
1381    getPlatform().copyFromLocal( inputFileLower );
1382    getPlatform().copyFromLocal( inputFileUpper );
1383
1384    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1385
1386    Map sources = new HashMap();
1387
1388    sources.put( "upper1", sourceUpper );
1389    sources.put( "upper2", sourceUpper );
1390
1391    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsamesourceovergroupbyintojoinsimple" ), SinkMode.REPLACE );
1392
1393    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1394
1395    Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter );
1396    Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter );
1397
1398    pipeUpper1 = new GroupBy( pipeUpper1, new Fields( "num" ) );
1399    pipeUpper2 = new GroupBy( pipeUpper2, new Fields( "num" ) );
1400
1401    Pipe splice1 = new HashJoin( pipeUpper1, new Fields( "num" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1402
1403    splice1 = new Each( splice1, new Identity() );
1404
1405    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice1 );
1406
1407    if( getPlatform().isMapReduce() )
1408      assertEquals( "wrong number of steps", 3, flow.getFlowSteps().size() );
1409
1410    flow.complete();
1411
1412    validateLength( flow, 5, null );
1413
1414    List<Tuple> actual = getSinkAsList( flow );
1415
1416    assertTrue( actual.contains( new Tuple( "1\tA\t1\tA" ) ) );
1417    assertTrue( actual.contains( new Tuple( "2\tB\t2\tB" ) ) );
1418    }
1419
1420  /**
1421   * Tests that two independent streamed sources with loadable tributaries properly plan into a GroupBy
1422   * without loading unused sources
1423   *
1424   * @throws Exception
1425   */
1426  @Test
1427  public void testJoinsIntoGroupBy() throws Exception
1428    {
1429    getPlatform().copyFromLocal( inputFileLower );
1430    getPlatform().copyFromLocal( inputFileUpper );
1431
1432    getPlatform().copyFromLocal( inputFileLhs );
1433    getPlatform().copyFromLocal( inputFileRhs );
1434
1435    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1436    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1437
1438    Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs );
1439    Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs );
1440
1441    Map sources = new HashMap();
1442
1443    sources.put( "lower", sourceLower );
1444    sources.put( "upper", sourceUpper );
1445    sources.put( "lhs", sourceLhs );
1446    sources.put( "rhs", sourceRhs );
1447
1448    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintogroupby" ), SinkMode.REPLACE );
1449
1450    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1451
1452    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1453    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
1454
1455    Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter );
1456    Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter );
1457
1458    Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1459
1460    upperLower = new Each( upperLower, new Identity() );
1461
1462    Pipe lhsRhs = new HashJoin( pipeLhs, new Fields( "num" ), pipeRhs, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1463
1464    lhsRhs = new Each( lhsRhs, new Identity() );
1465
1466    Pipe grouped = new GroupBy( "merging", Pipe.pipes( upperLower, lhsRhs ), new Fields( "num1" ) );
1467
1468    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped );
1469
1470    if( getPlatform().isMapReduce() )
1471      assertEquals( "wrong number of steps", 1, flow.getFlowSteps().size() );
1472
1473    flow.complete();
1474
1475    validateLength( flow, 42, null );
1476
1477    List<Tuple> actual = getSinkAsList( flow );
1478
1479    assertTrue( actual.contains( new Tuple( "1\ta\t1\tA" ) ) );
1480    assertTrue( actual.contains( new Tuple( "5\te\t5\tE" ) ) );
1481    }
1482
1483  @Test
1484  public void testJoinSamePipeAroundGroupBy() throws Exception
1485    {
1486    getPlatform().copyFromLocal( inputFileLower );
1487
1488    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1489    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "samepipearoundgroupby" ), SinkMode.REPLACE );
1490
1491    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1492
1493    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1494
1495    Pipe lhsPipe = new Each( new Pipe( "lhs", pipeLower ), new Identity() );
1496
1497    Pipe rhsPipe = new Each( new Pipe( "rhs", pipeLower ), new Identity() );
1498
1499    rhsPipe = new GroupBy( rhsPipe, new Fields( "num" ) );
1500
1501    rhsPipe = new Each( rhsPipe, new Identity() );
1502
1503    Pipe pipe = new HashJoin( lhsPipe, new Fields( "num" ), rhsPipe, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );
1504
1505    Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
1506
1507    flow.complete();
1508
1509    validateLength( flow, 5, null );
1510
1511    List<Tuple> actual = getSinkAsList( flow );
1512
1513    assertTrue( actual.contains( new Tuple( "1\ta\t1\ta" ) ) );
1514    assertTrue( actual.contains( new Tuple( "2\tb\t2\tb" ) ) );
1515    }
1516
1517  /**
1518   * This test results in two MR jobs because one join feeds into the accumulated side of the second. A mapper
1519   * can only stream on branch at a time forcing a temp file between the mappers. see next test for swapped join
1520   *
1521   * @throws Exception
1522   */
1523  @Test
1524  public void testJoinsIntoCoGroupLhs() throws Exception
1525    {
1526    getPlatform().copyFromLocal( inputFileLower );
1527    getPlatform().copyFromLocal( inputFileUpper );
1528
1529    getPlatform().copyFromLocal( inputFileLhs );
1530    getPlatform().copyFromLocal( inputFileRhs );
1531
1532    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1533    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1534
1535    Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs );
1536    Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs );
1537
1538    Map sources = new HashMap();
1539
1540    sources.put( "lower", sourceLower );
1541    sources.put( "upper", sourceUpper );
1542    sources.put( "lhs", sourceLhs );
1543    sources.put( "rhs", sourceRhs );
1544
1545    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintocogrouplhs" ), SinkMode.REPLACE );
1546
1547    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1548
1549    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1550    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
1551
1552    Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter );
1553    Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter );
1554
1555    Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) );
1556
1557    upperLower = new Each( upperLower, new Identity() );
1558
1559    Pipe lhsUpperLower = new HashJoin( pipeLhs, new Fields( "num" ), upperLower, new Fields( "numUpperLower" ), new Fields( "numLhs", "charLhs", "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) );
1560
1561    lhsUpperLower = new Each( lhsUpperLower, new Identity() );
1562
1563    Pipe grouped = new CoGroup( "cogrouping", lhsUpperLower, new Fields( "numLhs" ), pipeRhs, new Fields( "num" ) );
1564
1565    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped );
1566
1567    if( getPlatform().isMapReduce() )
1568      assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() );
1569
1570    flow.complete();
1571
1572    validateLength( flow, 37, null );
1573
1574    List<Tuple> actual = getSinkAsList( flow );
1575
1576    assertTrue( actual.contains( new Tuple( "1\ta\t1\ta\t1\tA\t1\tA" ) ) );
1577    assertTrue( actual.contains( new Tuple( "5\ta\t5\te\t5\tE\t5\tA" ) ) );
1578    }
1579
1580  /**
1581   * This test results in one MR jobs because one join feeds into the streamed side of the second.
1582   *
1583   * @throws Exception
1584   */
1585  @Test
1586  public void testJoinsIntoCoGroupLhsSwappedJoin() throws Exception
1587    {
1588    getPlatform().copyFromLocal( inputFileLower );
1589    getPlatform().copyFromLocal( inputFileUpper );
1590
1591    getPlatform().copyFromLocal( inputFileLhs );
1592    getPlatform().copyFromLocal( inputFileRhs );
1593
1594    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1595    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1596
1597    Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs );
1598    Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs );
1599
1600    Map sources = new HashMap();
1601
1602    sources.put( "lower", sourceLower );
1603    sources.put( "upper", sourceUpper );
1604    sources.put( "lhs", sourceLhs );
1605    sources.put( "rhs", sourceRhs );
1606
1607    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintocogrouplhsswappedjoin" ), SinkMode.REPLACE );
1608
1609    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1610
1611    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1612    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
1613
1614    Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter );
1615    Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter );
1616
1617    Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) );
1618
1619    upperLower = new Each( upperLower, new Identity() );
1620
1621    Pipe lhsUpperLower = new HashJoin( upperLower, new Fields( "numUpperLower" ), pipeLhs, new Fields( "num" ), new Fields( "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower", "numLhs", "charLhs" ) );
1622
1623    lhsUpperLower = new Each( lhsUpperLower, new Identity() );
1624
1625    Pipe grouped = new CoGroup( "cogrouping", lhsUpperLower, new Fields( "numLhs" ), pipeRhs, new Fields( "num" ) );
1626
1627    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped );
1628
1629    if( getPlatform().isMapReduce() )
1630      assertEquals( "wrong number of steps", 1, flow.getFlowSteps().size() );
1631
1632    flow.complete();
1633
1634    validateLength( flow, 37, null );
1635
1636    List<Tuple> actual = getSinkAsList( flow );
1637
1638    assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\ta\t1\tA" ) ) );
1639    assertTrue( actual.contains( new Tuple( "5\te\t5\tE\t5\te\t5\tE" ) ) );
1640    }
1641
1642  @Test
1643  public void testJoinsIntoCoGroupRhs() throws Exception
1644    {
1645    getPlatform().copyFromLocal( inputFileLower );
1646    getPlatform().copyFromLocal( inputFileUpper );
1647
1648    getPlatform().copyFromLocal( inputFileLhs );
1649    getPlatform().copyFromLocal( inputFileRhs );
1650
1651    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1652    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1653
1654    Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs );
1655    Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs );
1656
1657    Map sources = new HashMap();
1658
1659    sources.put( "lower", sourceLower );
1660    sources.put( "upper", sourceUpper );
1661    sources.put( "lhs", sourceLhs );
1662    sources.put( "rhs", sourceRhs );
1663
1664    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintocogrouprhs" ), SinkMode.REPLACE );
1665
1666    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1667
1668    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1669    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
1670
1671    Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter );
1672    Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter );
1673
1674    Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) );
1675
1676    upperLower = new Each( upperLower, new Identity() );
1677
1678    Pipe lhsUpperLower = new HashJoin( pipeLhs, new Fields( "num" ), upperLower, new Fields( "numUpperLower" ), new Fields( "numLhs", "charLhs", "numUpperLower", "charUpperLower", "num2UpperLower", "char2UpperLower" ) );
1679
1680    lhsUpperLower = new Each( lhsUpperLower, new Identity() );
1681
1682    Pipe grouped = new CoGroup( "cogrouping", pipeRhs, new Fields( "num" ), lhsUpperLower, new Fields( "numLhs" ) );
1683
1684    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped );
1685
1686    if( getPlatform().isMapReduce() )
1687      assertEquals( "wrong number of steps", 2, flow.getFlowSteps().size() );
1688
1689    flow.complete();
1690
1691    validateLength( flow, 37, null );
1692
1693    List<Tuple> actual = getSinkAsList( flow );
1694
1695    assertTrue( actual.contains( new Tuple( "1\tA\t1\ta\t1\ta\t1\tA" ) ) );
1696    assertTrue( actual.contains( new Tuple( "5\tE\t5\te\t5\te\t5\tE" ) ) );
1697    }
1698
1699  @Test
1700  public void testJoinsIntoCoGroup() throws Exception
1701    {
1702    getPlatform().copyFromLocal( inputFileLower );
1703    getPlatform().copyFromLocal( inputFileUpper );
1704
1705    getPlatform().copyFromLocal( inputFileLhs );
1706    getPlatform().copyFromLocal( inputFileRhs );
1707
1708    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1709    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1710
1711    Tap sourceLhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLhs );
1712    Tap sourceRhs = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileRhs );
1713
1714    Map sources = new HashMap();
1715
1716    sources.put( "lower", sourceLower );
1717    sources.put( "upper", sourceUpper );
1718    sources.put( "lhs", sourceLhs );
1719    sources.put( "rhs", sourceRhs );
1720
1721    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinsintocogroup" ), SinkMode.REPLACE );
1722
1723    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1724
1725    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1726    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
1727
1728    Pipe pipeLhs = new Each( new Pipe( "lhs" ), new Fields( "line" ), splitter );
1729    Pipe pipeRhs = new Each( new Pipe( "rhs" ), new Fields( "line" ), splitter );
1730
1731    Pipe upperLower = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "numUpperLower1", "charUpperLower1", "numUpperLower2", "charUpperLower2" ) );
1732
1733    upperLower = new Each( upperLower, new Identity() );
1734
1735    Pipe lhsRhs = new HashJoin( pipeLhs, new Fields( "num" ), pipeRhs, new Fields( "num" ), new Fields( "numLhsRhs1", "charLhsRhs1", "numLhsRhs2", "charLhsRhs2" ) );
1736
1737    lhsRhs = new Each( lhsRhs, new Identity() );
1738
1739    Pipe grouped = new CoGroup( "cogrouping", upperLower, new Fields( "numUpperLower1" ), lhsRhs, new Fields( "numLhsRhs1" ) );
1740
1741    Flow flow = getPlatform().getFlowConnector().connect( sources, sink, grouped );
1742
1743    if( getPlatform().isMapReduce() )
1744      assertEquals( "wrong number of steps", 1, flow.getFlowSteps().size() );
1745
1746    flow.complete();
1747
1748    validateLength( flow, 37, null );
1749
1750    List<Tuple> actual = getSinkAsList( flow );
1751
1752    assertTrue( actual.contains( new Tuple( "1\ta\t1\tA\t1\ta\t1\tA" ) ) );
1753    assertTrue( actual.contains( new Tuple( "5\te\t5\tE\t5\te\t5\tE" ) ) );
1754    }
1755
1756  public static class AllComparator implements Comparator<Comparable>, Hasher<Comparable>, Serializable
1757    {
1758
1759    @Override
1760    public int compare( Comparable lhs, Comparable rhs )
1761      {
1762      return lhs.toString().compareTo( rhs.toString() );
1763      }
1764
1765    @Override
1766    public int hashCode( Comparable value )
1767      {
1768      if( value == null )
1769        return 0;
1770
1771      return value.toString().hashCode();
1772      }
1773    }
1774
1775  /**
1776   * Tests Hasher being honored even if default comparator is null.
1777   *
1778   * @throws Exception
1779   */
1780  @Test
1781  public void testJoinWithHasher() throws Exception
1782    {
1783    getPlatform().copyFromLocal( inputFileLower );
1784    getPlatform().copyFromLocal( inputFileUpper );
1785
1786    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1787    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1788
1789    Map sources = new HashMap();
1790
1791    sources.put( "lower", sourceLower );
1792    sources.put( "upper", sourceUpper );
1793
1794    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinhasher" ), SinkMode.REPLACE );
1795
1796    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1797
1798    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1799
1800    pipeLower = new Each( pipeLower, new Fields( "num" ), new ExpressionFunction( Fields.ARGS, "Integer.parseInt( num )", String.class ), Fields.REPLACE );
1801
1802    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
1803
1804    Fields num = new Fields( "num" );
1805    num.setComparator( "num", new AllComparator() );
1806
1807    Pipe splice = new HashJoin( pipeLower, num, pipeUpper, new Fields( "num" ), Fields.size( 4 ) );
1808
1809    Map<Object, Object> properties = getProperties();
1810
1811    Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice );
1812
1813    flow.complete();
1814
1815    validateLength( flow, 5 );
1816
1817    List<Tuple> values = getSinkAsList( flow );
1818
1819    assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
1820    assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) );
1821    }
1822
1823  @Test
1824  public void testJoinNone() throws Exception
1825    {
1826    getPlatform().copyFromLocal( inputFileLower );
1827    getPlatform().copyFromLocal( inputFileUpper );
1828
1829    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1830    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1831
1832    Map sources = new HashMap();
1833
1834    sources.put( "lower", sourceLower );
1835    sources.put( "upper", sourceUpper );
1836
1837    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "joinnone" ), SinkMode.REPLACE );
1838
1839    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
1840
1841    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
1842    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
1843
1844    Pipe splice = new HashJoin( pipeLower, Fields.NONE, pipeUpper, Fields.NONE, Fields.size( 4 ) );
1845
1846    Map<Object, Object> properties = getProperties();
1847
1848    Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice );
1849
1850    flow.complete();
1851
1852    validateLength( flow, 25 );
1853
1854    List<Tuple> values = getSinkAsList( flow );
1855
1856    assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
1857    assertTrue( values.contains( new Tuple( "1\ta\t2\tB" ) ) );
1858    assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) );
1859    }
1860
1861  @Test
1862  public void testGroupBySplitJoins() throws Exception
1863    {
1864    getPlatform().copyFromLocal( inputFileLower );
1865    getPlatform().copyFromLocal( inputFileUpper );
1866    getPlatform().copyFromLocal( inputFileJoined );
1867
1868    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
1869    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
1870    Tap sourceJoined = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileJoined );
1871
1872    Map sources = new HashMap();
1873
1874    sources.put( "lower", sourceLower );
1875    sources.put( "upper", sourceUpper );
1876    sources.put( "joined", sourceJoined );
1877
1878    Tap lhsSink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "lhs" ), SinkMode.REPLACE );
1879    Tap rhsSink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "rhs" ), SinkMode.REPLACE );
1880
1881    Map sinks = new HashMap();
1882
1883    sinks.put( "lhs", lhsSink );
1884    sinks.put( "rhs", rhsSink );
1885
1886    Function splitterLower = new RegexSplitter( new Fields( "numA", "lower" ), " " );
1887    Function splitterUpper = new RegexSplitter( new Fields( "numB", "upper" ), " " );
1888    Function splitterJoined = new RegexSplitter( new Fields( "numC", "lowerC", "upperC" ), "\t" );
1889
1890    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitterLower );
1891    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitterUpper );
1892    Pipe pipeJoined = new Each( new Pipe( "joined" ), new Fields( "line" ), splitterJoined );
1893
1894    Pipe pipe = new GroupBy( pipeLower, new Fields( "numA" ) );
1895
1896    pipe = new Every( pipe, Fields.ALL, new TestIdentityBuffer( new Fields( "numA" ), 5, false ), Fields.RESULTS );
1897
1898    Pipe lhsPipe = new Each( pipe, new Identity() );
1899    lhsPipe = new HashJoin( "lhs", lhsPipe, new Fields( "numA" ), pipeUpper, new Fields( "numB" ) );
1900
1901    Pipe rhsPipe = new Each( pipe, new Identity() );
1902    rhsPipe = new HashJoin( "rhs", rhsPipe, new Fields( "numA" ), pipeJoined, new Fields( "numC" ) );
1903
1904    Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, lhsPipe, rhsPipe );
1905
1906    if( getPlatform().isMapReduce() )
1907      assertEquals( "wrong number of steps", 3, flow.getFlowSteps().size() );
1908
1909    flow.complete();
1910
1911    validateLength( flow.openSink( "lhs" ), 5, null );
1912    validateLength( flow.openSink( "rhs" ), 5, null );
1913
1914    List<Tuple> lhsActual = asList( flow, lhsSink );
1915
1916    assertTrue( lhsActual.contains( new Tuple( "1\ta\t1\tA" ) ) );
1917    assertTrue( lhsActual.contains( new Tuple( "2\tb\t2\tB" ) ) );
1918
1919    List<Tuple> rhsActual = asList( flow, rhsSink );
1920
1921    assertTrue( rhsActual.contains( new Tuple( "1\ta\t1\ta\tA" ) ) );
1922    assertTrue( rhsActual.contains( new Tuple( "2\tb\t2\tb\tB" ) ) );
1923    }
1924
1925  /**
1926   * When run against a cluster a Merge before a GroupBy can hide the streamed/accumulated nature of a branch.
1927   * <p>
1928   * The planner nw
1929   * <p>
1930   * commented code is for troubleshooting.
1931   *
1932   * @throws Exception
1933   */
1934  @Test
1935  public void testJoinMergeGroupBy() throws Exception
1936    {
1937    getPlatform().copyFromLocal( inputFileNums10 );
1938    getPlatform().copyFromLocal( inputFileNums20 );
1939
1940    Tap lhsTap = getPlatform().getTextFile( new Fields( "id" ), inputFileNums10 );
1941    Tap rhsTap = getPlatform().getTextFile( new Fields( "id2" ), inputFileNums20 );
1942
1943    Pipe lhs = new Pipe( "lhs" );
1944    Pipe rhs = new Pipe( "rhs" );
1945
1946//    Pipe joined = new CoGroup( messages, new Fields( "id" ), people, new Fields( "id2" ) );
1947    Pipe joined = new HashJoin( lhs, new Fields( "id" ), rhs, new Fields( "id2" ) );
1948
1949    Pipe pruned = new Each( joined, new Fields( "id2" ), new Identity(), Fields.RESULTS );
1950//    pruned = new Checkpoint( pruned );
1951    Pipe merged = new Merge( pruned, rhs );
1952    Pipe grouped = new GroupBy( merged, new Fields( "id2" ) );
1953//    Pipe grouped = new GroupBy( Pipe.pipes(  pruned, people  ), new Fields( "id2" ) );
1954    Aggregator count = new Count( new Fields( "count" ) );
1955    Pipe counted = new Every( grouped, count );
1956
1957    String testJoinMerge = "testJoinMergeGroupBy/" + ( ( joined instanceof CoGroup ) ? "cogroup" : "hashjoin" );
1958    Tap sink = getPlatform().getDelimitedFile( Fields.ALL, true, "\t", null, getOutputPath( testJoinMerge ), SinkMode.REPLACE );
1959
1960    FlowDef flowDef = FlowDef.flowDef()
1961      .setName( "join-merge" )
1962      .addSource( rhs, rhsTap )
1963      .addSource( lhs, lhsTap )
1964      .addTailSink( counted, sink );
1965
1966    Flow flow = getPlatform().getFlowConnector().connect( flowDef );
1967
1968//    flow.writeDOT( "joinmerge.dot" );
1969//    flow.writeStepsDOT( "joinmerge-steps.dot" );
1970
1971    flow.complete();
1972
1973    validateLength( flow, 20 );
1974
1975    List<Tuple> values = getSinkAsList( flow );
1976    List<Tuple> expected = new ArrayList<Tuple>();
1977
1978    expected.add( new Tuple( "1", "2" ) );
1979    expected.add( new Tuple( "10", "2" ) );
1980    expected.add( new Tuple( "11", "1" ) );
1981    expected.add( new Tuple( "12", "1" ) );
1982    expected.add( new Tuple( "13", "1" ) );
1983    expected.add( new Tuple( "14", "1" ) );
1984    expected.add( new Tuple( "15", "1" ) );
1985    expected.add( new Tuple( "16", "1" ) );
1986    expected.add( new Tuple( "17", "1" ) );
1987    expected.add( new Tuple( "18", "1" ) );
1988    expected.add( new Tuple( "19", "1" ) );
1989    expected.add( new Tuple( "2", "2" ) );
1990    expected.add( new Tuple( "20", "1" ) );
1991    expected.add( new Tuple( "3", "2" ) );
1992    expected.add( new Tuple( "4", "2" ) );
1993    expected.add( new Tuple( "5", "2" ) );
1994    expected.add( new Tuple( "6", "2" ) );
1995    expected.add( new Tuple( "7", "2" ) );
1996    expected.add( new Tuple( "8", "2" ) );
1997    expected.add( new Tuple( "9", "2" ) );
1998
1999    Collections.sort( values );
2000    Collections.sort( expected );
2001
2002    assertEquals( expected, values );
2003    }
2004
2005  /**
2006   * Under tez, this can result in the HashJoin being duplicated across nodes for each split after the HashJoin
2007   * BoundaryBalanceJoinSplitTransformer inserts a Boundary at the split, preventing duplication of the path
2008   *
2009   * @throws Exception
2010   */
2011  @Test
2012  public void testJoinSplit() throws Exception
2013    {
2014    getPlatform().copyFromLocal( inputFileLhs );
2015    getPlatform().copyFromLocal( inputFileRhs );
2016
2017    FlowDef flowDef = FlowDef.flowDef()
2018      .addSource( "lhs", getPlatform().getTextFile( inputFileLhs ) )
2019      .addSource( "rhs", getPlatform().getTextFile( inputFileRhs ) )
2020      .addSink( "lhsSink", getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "lhs" ), SinkMode.REPLACE ) )
2021      .addSink( "rhsSink", getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "rhs" ), SinkMode.REPLACE ) );
2022
2023    Pipe pipeLower = new Each( "lhs", new Fields( "line" ), new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " ) );
2024    Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) );
2025
2026    Pipe join = new HashJoin( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() );
2027
2028    Pipe pipeLhs = new Each( new Pipe( "lhsSink", join ), new Identity() );
2029    Pipe pipeRhs = new Each( new Pipe( "rhsSink", join ), new Identity() );
2030
2031    flowDef
2032      .addTail( pipeLhs )
2033      .addTail( pipeRhs );
2034
2035    Flow flow = getPlatform().getFlowConnector().connect( flowDef );
2036
2037    flow.complete();
2038
2039    validateLength( flow, 37, null );
2040
2041    List<Tuple> values = asList( flow, flowDef.getSinks().get( "lhsSink" ) );
2042
2043    assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
2044    assertTrue( values.contains( new Tuple( "1\ta\t1\tB" ) ) );
2045
2046    values = asList( flow, flowDef.getSinks().get( "rhsSink" ) );
2047
2048    assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
2049    assertTrue( values.contains( new Tuple( "1\ta\t1\tB" ) ) );
2050    }
2051
2052  /**
2053   * catches a situation where BottomUpJoinedBoundariesNodePartitioner may capture an invalid HashJoin sub-graph
2054   * if the in-bound Boundary is split upon.
2055   */
2056  @Test
2057  public void testSameSourceJoinSplitIntoJoin() throws Exception
2058    {
2059    getPlatform().copyFromLocal( inputFileLhs );
2060    getPlatform().copyFromLocal( inputFileRhs );
2061
2062    FlowDef flowDef = FlowDef.flowDef()
2063      .addSource( "lhs", getPlatform().getTextFile( inputFileLhs ) )
2064      .addSource( "rhs", getPlatform().getTextFile( inputFileLhs ) )
2065      .addSource( "joinSecond", getPlatform().getTextFile( inputFileRhs ) )
2066      .addSink( "lhsSink", getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "lhs" ), SinkMode.REPLACE ) )
2067      .addSink( "rhsSink", getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "rhs" ), SinkMode.REPLACE ) );
2068
2069    Pipe pipeLower = new Each( "lhs", new Fields( "line" ), new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " ) );
2070    Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) );
2071
2072    Pipe joinFirst = new HashJoin( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() );
2073
2074    Pipe pipeLhs = new Each( new Pipe( "lhsSink", joinFirst ), new Identity() );
2075
2076    Pipe joinSecond = new Each( "joinSecond", new Fields( "line" ), new RegexSplitter( new Fields( "numRHSSecond", "charRHSSecond" ), " " ) );
2077
2078    joinSecond = new HashJoin( joinFirst, new Fields( "numLHS" ), joinSecond, new Fields( "numRHSSecond" ) );
2079
2080    Pipe pipeRhs = new Each( new Pipe( "rhsSink", joinSecond ), new Identity() );
2081
2082    flowDef
2083      .addTail( pipeLhs )
2084      .addTail( pipeRhs );
2085
2086    Flow flow = getPlatform().getFlowConnector().connect( flowDef );
2087
2088    flow.complete();
2089
2090    List<Tuple> values = asList( flow, flowDef.getSinks().get( "lhsSink" ) );
2091
2092    assertEquals( 37, values.size() );
2093    assertTrue( values.contains( new Tuple( "1\ta\t1\ta" ) ) );
2094    assertTrue( values.contains( new Tuple( "1\ta\t1\tb" ) ) );
2095
2096    values = asList( flow, flowDef.getSinks().get( "rhsSink" ) );
2097
2098    assertEquals( 109, values.size() );
2099    assertTrue( values.contains( new Tuple( "1\ta\t1\ta\t1\tA" ) ) );
2100    assertTrue( values.contains( new Tuple( "1\ta\t1\tb\t1\tB" ) ) );
2101    }
2102
2103  /**
2104   * checks that a split after a HashJoin does not result in the HashJoin execution being duplicated across
2105   * multiple nodes, one for each branch in the split.
2106   */
2107  @Test
2108  public void testJoinSplitBeforeJoin() throws Exception
2109    {
2110    getPlatform().copyFromLocal( inputFileLhs );
2111    getPlatform().copyFromLocal( inputFileRhs );
2112
2113    FlowDef flowDef = FlowDef.flowDef()
2114      .addSource( "lhs", getPlatform().getTextFile( inputFileLhs ) )
2115      .addSource( "rhs", getPlatform().getTextFile( inputFileRhs ) )
2116      .addSource( "joinSecond", getPlatform().getTextFile( inputFileRhs ) )
2117      .addSink( "lhsSink", getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "lhs" ), SinkMode.REPLACE ) )
2118      .addSink( "rhsSink", getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "rhs" ), SinkMode.REPLACE ) );
2119
2120    Pipe pipeLower = new Each( "lhs", new Fields( "line" ), new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " ) );
2121    Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) );
2122
2123    pipeUpper = new Checkpoint( pipeUpper );
2124
2125    HashJoin hashJoin = new HashJoin( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() );
2126
2127    Pipe joinFirst = hashJoin;
2128
2129    joinFirst = new Each( joinFirst, new Identity() );
2130
2131    Pipe pipeLhs = new Each( new Pipe( "lhsSink", joinFirst ), new Identity() );
2132
2133    pipeLhs = new GroupBy( pipeLhs, new Fields( "numLHS" ) );
2134
2135    joinFirst = new Each( new Pipe( "lhsSplit", joinFirst ), new Identity() );
2136
2137    Pipe joinSecond = new Each( "joinSecond", new Fields( "line" ), new RegexSplitter( new Fields( "numRHSSecond", "charRHSSecond" ), " " ) );
2138
2139    joinSecond = new CoGroup( joinFirst, new Fields( "numLHS" ), joinSecond, new Fields( "numRHSSecond" ) );
2140
2141    Pipe pipeRhs = new Each( new Pipe( "rhsSink", joinSecond ), new Identity() );
2142
2143    flowDef
2144      .addTail( pipeLhs )
2145      .addTail( pipeRhs );
2146
2147    Flow flow = getPlatform().getFlowConnector().connect( flowDef );
2148
2149    if( getPlatform().isDAG() )
2150      {
2151      FlowStep flowStep = (FlowStep) flow.getFlowSteps().get( 0 );
2152      List<ElementGraph> elementGraphs = flowStep.getFlowNodeGraph().getElementGraphs( hashJoin );
2153
2154      assertEquals( 1, elementGraphs.size() );
2155      }
2156
2157    flow.complete();
2158
2159    List<Tuple> values = asList( flow, flowDef.getSinks().get( "lhsSink" ) );
2160
2161    assertEquals( 37, values.size() );
2162    assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
2163    assertTrue( values.contains( new Tuple( "1\ta\t1\tB" ) ) );
2164
2165    values = asList( flow, flowDef.getSinks().get( "rhsSink" ) );
2166
2167    assertEquals( 109, values.size() );
2168    assertTrue( values.contains( new Tuple( "1\ta\t1\tA\t1\tA" ) ) );
2169    assertTrue( values.contains( new Tuple( "1\ta\t1\tB\t1\tB" ) ) );
2170    }
2171
2172  @Test
2173  public void testGroupBySplitGroupByJoin() throws Exception
2174    {
2175    getPlatform().copyFromLocal( inputFileLower );
2176
2177    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
2178
2179    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "sink" ), SinkMode.REPLACE );
2180
2181    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
2182
2183    Pipe pipeFirst = new Pipe( "first" );
2184    pipeFirst = new Each( pipeFirst, new Fields( "line" ), splitter );
2185    pipeFirst = new GroupBy( pipeFirst, new Fields( "num" ) );
2186    pipeFirst = new Every( pipeFirst, new Fields( "char" ), new First( new Fields( "firstFirst" ) ), Fields.ALL );
2187
2188    Pipe pipeSecond = new Pipe( "second", pipeFirst );
2189    pipeSecond = new Each( pipeSecond, new Identity() );
2190    pipeSecond = new GroupBy( pipeSecond, new Fields( "num" ) );
2191    pipeSecond = new Every( pipeSecond, new Fields( "firstFirst" ), new First( new Fields( "secondFirst" ) ), Fields.ALL );
2192    pipeSecond = new GroupBy( pipeSecond, new Fields( "num" ) );
2193    pipeSecond = new Every( pipeSecond, new Fields( "secondFirst" ), new First( new Fields( "thirdFirst" ) ), Fields.ALL );
2194
2195    Pipe splice = new HashJoin( pipeFirst, new Fields( "num" ), pipeSecond, new Fields( "num" ), Fields.size( 4 ) );
2196
2197    Flow flow = getPlatform().getFlowConnector().connect( source, sink, splice );
2198
2199    flow.complete();
2200
2201    validateLength( flow, 5, null );
2202
2203    List<Tuple> values = getSinkAsList( flow );
2204
2205    assertTrue( values.contains( new Tuple( "1\ta\t1\ta" ) ) );
2206    assertTrue( values.contains( new Tuple( "2\tb\t2\tb" ) ) );
2207    assertTrue( values.contains( new Tuple( "3\tc\t3\tc" ) ) );
2208    assertTrue( values.contains( new Tuple( "4\td\t4\td" ) ) );
2209    assertTrue( values.contains( new Tuple( "5\te\t5\te" ) ) );
2210    }
2211
2212  @Test
2213  public void testGroupBySplitSplitGroupByJoin() throws Exception
2214    {
2215    getPlatform().copyFromLocal( inputFileLower );
2216
2217    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
2218
2219    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "sink" ), SinkMode.REPLACE );
2220
2221    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
2222
2223    Pipe pipeFirst = new Pipe( "first" );
2224    pipeFirst = new Each( pipeFirst, new Fields( "line" ), splitter );
2225    pipeFirst = new GroupBy( pipeFirst, new Fields( "num" ) );
2226    pipeFirst = new Every( pipeFirst, new Fields( "char" ), new First( new Fields( "firstFirst" ) ), Fields.ALL );
2227
2228    Pipe pipeSecond = new Pipe( "second", pipeFirst );
2229    pipeSecond = new Each( pipeSecond, new Identity() );
2230    pipeSecond = new GroupBy( pipeSecond, new Fields( "num" ) );
2231    pipeSecond = new Every( pipeSecond, new Fields( "firstFirst" ), new First( new Fields( "secondFirst" ) ), Fields.ALL );
2232
2233    Pipe splice = new HashJoin( pipeFirst, new Fields( "num" ), pipeSecond, new Fields( "num" ), Fields.size( 4 ) );
2234//    Pipe splice = new HashJoin( pipeSecond, new Fields( "num" ), pipeFirst, new Fields( "num" ), Fields.size( 4 ) );
2235
2236    splice = new HashJoin( splice, new Fields( 0 ), pipeSecond, new Fields( "num" ), Fields.size( 6 ) );
2237
2238    Flow flow = getPlatform().getFlowConnector().connect( source, sink, splice );
2239
2240    flow.complete();
2241
2242    validateLength( flow, 5, null );
2243
2244    List<Tuple> values = getSinkAsList( flow );
2245
2246    assertTrue( values.contains( new Tuple( "1\ta\t1\ta\t1\ta" ) ) );
2247    assertTrue( values.contains( new Tuple( "2\tb\t2\tb\t2\tb" ) ) );
2248    assertTrue( values.contains( new Tuple( "3\tc\t3\tc\t3\tc" ) ) );
2249    assertTrue( values.contains( new Tuple( "4\td\t4\td\t4\td" ) ) );
2250    assertTrue( values.contains( new Tuple( "5\te\t5\te\t5\te" ) ) );
2251    }
2252
  /**
   * Splits a single source into two grouped branches, HashJoins them, then HashJoins the result against
   * a third branch split from the second one, while the first branch is also written to a second sink.
   *
   * @throws Exception
   */
  @Test
  public void testGroupBySplitAroundSplitGroupByJoin() throws Exception
    {
    getPlatform().copyFromLocal( inputFileLower );

    Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );

    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "sink" ), SinkMode.REPLACE );
    Tap sink2 = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "sink2" ), SinkMode.REPLACE );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

    // "init" is the single head; "first" and "second" both branch off it
    Pipe pipeInit = new Pipe( "init" );
    Pipe pipeFirst = new Pipe( "first", pipeInit );
    pipeFirst = new Each( pipeFirst, new Fields( "line" ), splitter );
    pipeFirst = new GroupBy( pipeFirst, new Fields( "num" ) );
    pipeFirst = new Every( pipeFirst, new Fields( "char" ), new First( new Fields( "firstFirst" ) ), Fields.ALL );

    // the first branch is additionally written to sink2
    Pipe sink2Pipe = new Pipe( "sink2", pipeFirst );

    Pipe pipeSecond = new Pipe( "second", pipeInit );
    pipeSecond = new Each( pipeSecond, new Fields( "line" ), splitter );
    pipeSecond = new GroupBy( pipeSecond, new Fields( "num" ) );
    pipeSecond = new Every( pipeSecond, new Fields( "char" ), new First( new Fields( "secondFirst" ) ), Fields.ALL );

    // the second branch is the left-hand side here; the commented line is the swapped alternative
//    Pipe splice = new HashJoin( pipeFirst, new Fields( "num" ), pipeSecond, new Fields( "num" ), Fields.size( 4 ) );
    Pipe splice = new HashJoin( pipeSecond, new Fields( "num" ), pipeFirst, new Fields( "num" ), Fields.size( 4 ) );

    // "third" branches off the second branch and is grouped once more
    Pipe pipeThird = new Pipe( "third", pipeSecond );
    pipeThird = new Each( pipeThird, new Identity() );
    pipeThird = new GroupBy( pipeThird, new Fields( "num" ) );
    pipeThird = new Every( pipeThird, new Fields( "secondFirst" ), new First( new Fields( "thirdFirst" ) ), Fields.ALL );

    // join the earlier result, keyed on its first field by position, against the third branch
    splice = new HashJoin( splice, new Fields( 0 ), pipeThird, new Fields( "num" ), Fields.size( 6 ) );

    FlowDef flowDef = FlowDef.flowDef()
      .setName( splice.getName() )
      .addSource( "init", source )
      .addTailSink( splice, sink )
      .addTailSink( sink2Pipe, sink2 );

    Flow flow = getPlatform().getFlowConnector().connect( flowDef );

    flow.complete();

    validateLength( flow, 5, null );

    List<Tuple> values = getSinkAsList( flow );

    assertTrue( values.contains( new Tuple( "1\ta\t1\ta\t1\ta" ) ) );
    assertTrue( values.contains( new Tuple( "2\tb\t2\tb\t2\tb" ) ) );
    assertTrue( values.contains( new Tuple( "3\tc\t3\tc\t3\tc" ) ) );
    assertTrue( values.contains( new Tuple( "4\td\t4\td\t4\td" ) ) );
    assertTrue( values.contains( new Tuple( "5\te\t5\te\t5\te" ) ) );
    }
2308
2309  /**
2310   * This test checks for a deadlock when the same input is forked, adapted on one edge, then hashjoined back together.
2311   *
2312   * @throws Exception
2313   */
2314  @Test
2315  public void testForkThenJoin() throws Exception
2316    {
2317    getPlatform().copyFromLocal( inputFileLower );
2318    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
2319
2320    Map sources = new HashMap();
2321
2322    sources.put( "lower", sourceLower );
2323
2324    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "join" ), SinkMode.REPLACE );
2325
2326    Function splitter = new RegexSplitter( new Fields( "num", "text" ), " " );
2327
2328    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
2329    Pipe pipeUpper = new Each( new Pipe( "upper", pipeLower ), new Fields( "text" ),
2330      new ExpressionFunction( Fields.ARGS, "text.toUpperCase(java.util.Locale.ROOT)", String.class ),
2331      Fields.REPLACE );
2332
2333    Pipe splice = new HashJoin( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );
2334
2335    Map<Object, Object> properties = getProperties();
2336
2337    Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice );
2338
2339    flow.complete();
2340
2341    validateLength( flow, 5 );
2342
2343    List<Tuple> values = getSinkAsList( flow );
2344
2345    assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) );
2346    assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) );
2347    }
2348
2349  /**
2350   * This test checks for a deadlock when the same input is forked, adapted on one edge, then hashjoined back together.
2351   *
2352   * @throws Exception
2353   */
2354  @Test
2355  public void testForkCoGroupThenHashJoin() throws Exception
2356    {
2357    getPlatform().copyFromLocal( inputFileLower );
2358    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
2359    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
2360
2361    Map sources = new HashMap();
2362
2363    sources.put( "sourceLower", sourceLower );
2364    sources.put( "sourceUpper", sourceUpper );
2365
2366    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "join" ), SinkMode.REPLACE );
2367
2368    Function splitter = new RegexSplitter( new Fields( "num", "text" ), " " );
2369
2370    Pipe leftPipeLower = new Each( new Pipe( "sourceLower" ), new Fields( "line" ), splitter );
2371    Pipe rightPipeUpper = new Each( new Pipe( "sourceUpper" ), new Fields( "line" ), splitter );
2372
2373    Pipe leftPipeUpper = new Each( new Pipe( "leftUpper", leftPipeLower ), new Fields( "text" ),
2374      new ExpressionFunction( Fields.ARGS, "text.toUpperCase(java.util.Locale.ROOT)", String.class ),
2375      Fields.REPLACE );
2376    Pipe rightPipeLower = new Each( new Pipe( "rightLower", rightPipeUpper ), new Fields( "text" ),
2377      new ExpressionFunction( Fields.ARGS, "text.toLowerCase(java.util.Locale.ROOT)", String.class ),
2378      Fields.REPLACE );
2379
2380    leftPipeUpper = new GroupBy( leftPipeUpper, new Fields( "num" ) );
2381    rightPipeLower = new GroupBy( rightPipeLower, new Fields( "num" ) );
2382
2383    Pipe middleSplice = new CoGroup( "middleCoGroup", leftPipeUpper, new Fields( "num" ), rightPipeLower, new Fields( "num" ), new Fields( "numM1", "charM1", "numM2", "charM2" ) );
2384
2385    Pipe leftSplice = new HashJoin( leftPipeLower, new Fields( "num" ), middleSplice, new Fields( "numM1" ) );
2386
2387    Map<Object, Object> properties = getProperties();
2388
2389    Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, leftSplice );
2390
2391    flow.complete();
2392
2393    validateLength( flow, 5 );
2394
2395    List<Tuple> values = getSinkAsList( flow );
2396    // that the flow completes at all is already success.
2397    assertTrue( values.contains( new Tuple( "1\ta\t1\tA\t1\ta" ) ) );
2398    assertTrue( values.contains( new Tuple( "2\tb\t2\tB\t2\tb" ) ) );
2399    }
2400
2401  /**
2402   * This test checks for a deadlock when the same input is forked, adapted on one edge, cogroup with something,
2403   * then hashjoined back together.
2404   *
2405   * @throws Exception
2406   */
2407  @Test
2408  public void testForkCoGroupThenHashJoinCoGroupAgain() throws Exception
2409    {
2410    getPlatform().copyFromLocal( inputFileLower );
2411    getPlatform().copyFromLocal( inputFileUpper );
2412
2413    Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower );
2414    Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper );
2415
2416    Map sources = new HashMap();
2417
2418    sources.put( "sourceLower", sourceLower );
2419    sources.put( "sourceUpper", sourceUpper );
2420
2421    Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "join" ), SinkMode.REPLACE );
2422
2423    Function splitter = new RegexSplitter( new Fields( "num", "text" ), " " );
2424
2425    Pipe leftPipeLower = new Each( new Pipe( "sourceLower" ), new Fields( "line" ), splitter );
2426    Pipe rightPipeUpper = new Each( new Pipe( "sourceUpper" ), new Fields( "line" ), splitter );
2427
2428    Pipe leftPipeUpper = new Each( new Pipe( "leftUpper", leftPipeLower ), new Fields( "text" ),
2429      new ExpressionFunction( Fields.ARGS, "text.toUpperCase(java.util.Locale.ROOT)", String.class ),
2430      Fields.REPLACE );
2431    Pipe rightPipeLower = new Each( new Pipe( "rightLower", rightPipeUpper ), new Fields( "text" ),
2432      new ExpressionFunction( Fields.ARGS, "text.toLowerCase(java.util.Locale.ROOT)", String.class ),
2433      Fields.REPLACE );
2434
2435    leftPipeUpper = new GroupBy( leftPipeUpper, new Fields( "num" ) );
2436    rightPipeLower = new GroupBy( rightPipeLower, new Fields( "num" ) );
2437
2438    Pipe middleSplice = new CoGroup( "middleCoGroup", leftPipeUpper, new Fields( "num" ), rightPipeLower, new Fields( "num" ), new Fields( "numM1", "charM1", "numM2", "charM2" ) );
2439
2440    Pipe leftSplice = new HashJoin( leftPipeLower, new Fields( "num" ), middleSplice, new Fields( "numM1" ) );
2441    Pipe rightSplice = new HashJoin( rightPipeUpper, new Fields( "num" ), middleSplice, new Fields( "numM2" ) );
2442
2443    leftSplice = new Rename( leftSplice, new Fields( "num", "text", "numM1", "charM1", "numM2", "charM2" ), new Fields( "numL1", "charL1", "numM1L", "charM1L", "numM2L", "charM2L" ) );
2444    rightSplice = new Rename( rightSplice, new Fields( "num", "text", "numM1", "charM1", "numM2", "charM2" ), new Fields( "numR1", "charR1", "numM1R", "charM1R", "numM2R", "charM2R" ) );
2445
2446    leftSplice = new GroupBy( leftSplice, new Fields( "numM1L" ) );
2447    rightSplice = new GroupBy( rightSplice, new Fields( "numM2R" ) );
2448
2449    Pipe splice = new CoGroup( "cogrouping", leftSplice, new Fields( "numM1L" ), rightSplice, new Fields( "numM2R" ) );
2450
2451    Map<Object, Object> properties = getProperties();
2452
2453    Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice );
2454
2455    flow.complete();
2456
2457    validateLength( flow, 5 );
2458
2459    List<Tuple> values = getSinkAsList( flow );
2460
2461    // getting this far is a success already (past old deadlocks)
2462    assertTrue( values.contains( new Tuple( "1\ta\t1\tA\t1\ta\t1\tA\t1\tA\t1\ta" ) ) );
2463    assertTrue( values.contains( new Tuple( "2\tb\t2\tB\t2\tb\t2\tB\t2\tB\t2\tb" ) ) );
2464    }
2465  }