001/* 002# Licensed Materials - Property of IBM 003# Copyright IBM Corp. 2015 004 */ 005package parallel; 006 007import static com.ibm.streamsx.topology.file.FileStreams.directoryWatcher; 008import static com.ibm.streamsx.topology.file.FileStreams.textFileReader; 009 010import java.io.ObjectStreamException; 011import java.util.logging.Logger; 012import java.util.regex.Matcher; 013import java.util.regex.Pattern; 014 015import com.ibm.streamsx.topology.TStream; 016import com.ibm.streamsx.topology.Topology; 017import com.ibm.streamsx.topology.context.StreamsContextFactory; 018import com.ibm.streamsx.topology.function.Function; 019import com.ibm.streamsx.topology.function.Predicate; 020 021public class ParallelRegexGrep { 022 static final Logger trace = Logger.getLogger("samples"); 023 024 @SuppressWarnings("serial") 025 public static void main(String[] args) throws Exception { 026 String contextType = args[0]; 027 String directory = args[1]; 028 final Pattern pattern = Pattern.compile(args[2]); 029 030 // Define the topology 031 Topology topology = new Topology("ParallelRegexGrep"); 032 033 // All streams with tuples that are Java String objects 034 TStream<String> files = directoryWatcher(topology, directory); 035 036 // Create a stream of lines from each file. 037 TStream<String> lines = textFileReader(files); 038 039 // Count the total number of lines before they are split between 040 // different parallel channels. 041 TStream<String> lines_counter = lines.transform( 042 new Function<String, String>() { 043 044 private int numSentStrings = 0; 045 046 @Override 047 public String apply(String v1) { 048 trace.info("Have sent " + (++numSentStrings) 049 + "to be filtered."); 050 return v1; 051 } 052 053 }); 054 055 // Parallelize the Stream. 056 // Since there are 5 channels of the stream, the approximate number of 057 // lines sent to each channel should be numSentStrings/5. This can be 058 // verified by comparing the outputs of the lines_counter stream to that 059 // of the parallel channels. 060 TStream<String> lines_parallel = lines_counter.parallel(5); 061 062 // Filter for the matched string, and print the number strings that have 063 // been tested. This is happening in parallel. 064 TStream<String> filtered_parallel = lines_parallel 065 .filter(new Predicate<String>() { 066 067 private int numReceivedStrings = 0; 068 069 @Override 070 public boolean test(String v1) { 071 trace.info("Have received " + (++numReceivedStrings) 072 + "strings on this parallel channel."); 073 // Pass the line through if it matches the 074 // regular expression pattern 075 return matcher.reset(v1).matches(); 076 } 077 078 transient Matcher matcher; 079 080 private Object readResolve() throws ObjectStreamException { 081 matcher = pattern.matcher(""); 082 return this; 083 } 084 }); 085 086 // Join the results of each parallel filter into one stream, 087 // merging the parallel streams back into one stream. 088 TStream<String> filtered_condensed = filtered_parallel.endParallel(); 089 090 // Print the combined results 091 filtered_condensed.print(); 092 093 // Execute the topology 094 StreamsContextFactory.getStreamsContext(contextType).submit(topology); 095 } 096}