How To Steal MP3s From The Internet

I was working out in the gym when I heard two guys talking about downloading music from the Internet so they could listen to it on their mp3 players while working out. They were actually talking about paying for music, so I told them they could find mp3s for free on the Internet. One of the guys was interested in how you could find mp3 files, and I told him they could be searched for with a program like SearchCrawler. I describe how to search html files on the Internet for strings in my Password Protection Program. SearchCrawler goes a step further: it searches each page for anchor tags, then crawls the linked pages, searches those for anchor tags, and so on. I added another search to that program, so now it also looks for web addresses that end in ".mp3", i.e., mp3 files. When an mp3 file is found, its address is written to an html file called mp3.html. When you're done searching, you can play the files by loading mp3.html into a browser and clicking on the mp3 links.
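
The heart of the mp3 search fits in a few lines. Here's a rough sketch of the idea, not the real program (the class name and the sample page are just for illustration; the full crawler below also follows links, limits the number of URLs, and so on):

import java.io.PrintStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Mp3LinkSketch
{
    public static void main(String[] args) throws Exception
    {
        // Stand-in for a page the crawler has downloaded.
        String pageContents = "<a href=\"http://example.com/number9.mp3\">Number 9</a>";

        // Same anchor-tag pattern the crawler uses to pull out href values.
        Pattern hrefPattern = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]",
                Pattern.CASE_INSENSITIVE);
        Matcher m = hrefPattern.matcher(pageContents);

        // Keep only the links that end in ".mp3" and write each one
        // to mp3.html as a clickable link.
        PrintStream mp3File = new PrintStream("mp3.html");
        while (m.find())
        {
            String link = m.group(1).trim();
            if (link.toLowerCase().endsWith(".mp3"))
            {
                mp3File.println("<a href=\"" + link + "\">" + link + "</a><br>");
            }
        }
        mp3File.close();
    }
}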

This is a first-draft program because it only finds complete web addresses. A complete program would search for any file that ends with ".mp3", determine whether it's a complete web address, and complete it if it isn't. E.g., say SearchCrawler found a file called number9.mp3; it would have to determine its current web address (no big deal if you store the name of the last link looked at) and prepend it to the name of the mp3 file, like http://www.beatlesfans/number9.mp3 (there's a quick sketch of how java.net.URL could handle that after the listing). As it currently stands, I can still find a lot of mp3 files, since I just ran the program and got 1200 links from the 52 sites it crawled. You can view that page by clicking here. Here's the code for the current program (my modifications are surrounded with // DMK):


import java.awt.*;
import java.awt.event.*;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import javax.swing.*;
import javax.swing.table.*;

// The Search Web Crawler
public class SearchMP3 extends JFrame
{
    //comments that start with "DMK 02/02/2012" are for batch execution
    boolean limitSearching = false;
    boolean GotAMatch = false;
    boolean WroteAnAddress = false;
    String searchName = null;   //passed as args[0]  DMK 02/02/2012
    String searchURL = null;    //passed as args[1]   DMK 02/02/2012
    // DMK
    PrintStream mp3File;   //file to save mp3 addresses
    PrintStream IAmRunning = null;   //DMK 02/02/2012  my hack for synchronizing batch jobs   
    int mailCounter = 0;
    int crawlCntr = 0;
    // DMK

    // Max URLs drop down values.
    private static final String[] MAX_URLS =
    {"50", "100", "500", "1000", "10000", "100000"};
    
    // Cache of robot disallow lists.
    private HashMap disallowListCache = new HashMap();
    
    // Search GUI controls.
    private JTextField startTextField;
    private JComboBox maxComboBox;
    private JCheckBox limitCheckBox;
    private JTextField logTextField;
    private JTextField searchTextField;
    private JCheckBox caseCheckBox;
    private JButton searchButton;
    
    // Search stats GUI controls.
    private JLabel crawlingLabel2;
    private JLabel crawledLabel2;
    private JLabel toCrawlLabel2;
    private JProgressBar progressBar;
    private JLabel matchesLabel2;
    
    // Table listing search matches.
    private JTable table;
    
    // Flag for whether or not crawling is underway.
    private boolean crawling;
    
    // Matches log file print writer.
    private PrintWriter logFileWriter;
    
    // Constructor for Search Web Crawler.
    public SearchMP3(String nameStr, String URLStr, boolean limit)
    {
       try
       {  // DMK
          mp3File = new PrintStream("mp3.html"); 
          mp3File.println("");
          mp3File.println("   ");
          mp3File.println("      Dave's Music Links");
          mp3File.println("   ");
          mp3File.println("   ");
          if (limit)
          {
             limitSearching = true;
          }
          else
          {
             limitSearching = false;
          }
          searchName = nameStr;
          searchURL = URLStr;
          //mp3File = new PrintStream(searchName);
          IAmRunning = new PrintStream("IAmRunning"); //DMK 02/02/2012  my hack for synchronizing batch jobs
          IAmRunning.close();
       }
       catch (IOException e)
       {
          System.out.println("Unable to open mp3 log");
       }
        // Set application title.
        setTitle("Search Crawler");
        
        // Set window size.
        setSize(600, 600);
        
        // Handle window closing events.
        addWindowListener(new WindowAdapter() {
            public void windowClosing(WindowEvent e) {
                actionExit();
            }
        });
        
        // Set up file menu.
        JMenuBar menuBar = new JMenuBar();
        JMenu fileMenu = new JMenu("File");
        fileMenu.setMnemonic(KeyEvent.VK_F);
        JMenuItem fileExitMenuItem = new JMenuItem("Exit",
                KeyEvent.VK_X);
        fileExitMenuItem.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent e) {
                actionExit();
            }
        });
        fileMenu.add(fileExitMenuItem);
        menuBar.add(fileMenu);
        setJMenuBar(menuBar);
        
        // Set up search panel.
        JPanel searchPanel = new JPanel();
        GridBagConstraints constraints;
        GridBagLayout layout = new GridBagLayout();
        searchPanel.setLayout(layout);
        
        JLabel startLabel = new JLabel("Start URL:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(startLabel, constraints);
        searchPanel.add(startLabel);
        
        startTextField = new JTextField();
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 0, 5);
        layout.setConstraints(startTextField, constraints);
        searchPanel.add(startTextField);
        
        JLabel maxLabel = new JLabel("Max URLs to Crawl:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(maxLabel, constraints);
        searchPanel.add(maxLabel);
        
        maxComboBox = new JComboBox(MAX_URLS);
        maxComboBox.setEditable(true);
        constraints = new GridBagConstraints();
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(maxComboBox, constraints);
        searchPanel.add(maxComboBox);
        
        limitCheckBox =
                new JCheckBox("Limit crawling to Start URL site");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.WEST;
        constraints.insets = new Insets(0, 10, 0, 0);
        layout.setConstraints(limitCheckBox, constraints);
        searchPanel.add(limitCheckBox);
        if (limitSearching)                                  //DMK 02/05/2012 - For running via command line
        {                                                    //allows us to limit search to a single site
           limitCheckBox.setSelected(true);
        }                                                    //useful for job searching
        
        JLabel blankLabel = new JLabel();
        constraints = new GridBagConstraints();
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        layout.setConstraints(blankLabel, constraints);
        searchPanel.add(blankLabel);
        
        JLabel logLabel = new JLabel("Matches Log File:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(logLabel, constraints);
        searchPanel.add(logLabel);
        
        String file =
                System.getProperty("user.dir") +
                System.getProperty("file.separator") +
                "crawler.log";
        logTextField = new JTextField(file);
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 0, 5);
        layout.setConstraints(logTextField, constraints);
        searchPanel.add(logTextField);
        
        JLabel searchLabel = new JLabel("Search String:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(searchLabel, constraints);
        searchPanel.add(searchLabel);
        
        searchTextField = new JTextField();
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.insets = new Insets(5, 5, 0, 0);
        constraints.gridwidth= 2;
        constraints.weightx = 1.0d;
        layout.setConstraints(searchTextField, constraints);
        searchPanel.add(searchTextField);
        
        caseCheckBox = new JCheckBox("Case Sensitive");
        constraints = new GridBagConstraints();
        constraints.insets = new Insets(5, 5, 0, 5);
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        layout.setConstraints(caseCheckBox, constraints);
        searchPanel.add(caseCheckBox);
        
        searchButton = new JButton("Search");
        searchButton.addActionListener(new ActionListener() {
            public void actionPerformed(ActionEvent e) {
                actionSearch();
            }
        });
        constraints = new GridBagConstraints();
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 5, 5);
        layout.setConstraints(searchButton, constraints);
        searchPanel.add(searchButton);
        
        JSeparator separator = new JSeparator();
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 5, 5);
        layout.setConstraints(separator, constraints);
        searchPanel.add(separator);
        
        JLabel crawlingLabel1 = new JLabel("Crawling:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(crawlingLabel1, constraints);
        searchPanel.add(crawlingLabel1);
        
        crawlingLabel2 = new JLabel();
        crawlingLabel2.setFont(
                crawlingLabel2.getFont().deriveFont(Font.PLAIN));
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 0, 5);
        layout.setConstraints(crawlingLabel2, constraints);
        searchPanel.add(crawlingLabel2);
        
        JLabel crawledLabel1 = new JLabel("Crawled URLs:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(crawledLabel1, constraints);
        searchPanel.add(crawledLabel1);
        
        crawledLabel2 = new JLabel();
        crawledLabel2.setFont(
                crawledLabel2.getFont().deriveFont(Font.PLAIN));
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 0, 5);
        layout.setConstraints(crawledLabel2, constraints);
        searchPanel.add(crawledLabel2);
        
        JLabel toCrawlLabel1 = new JLabel("URLs to Crawl:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(toCrawlLabel1, constraints);
        searchPanel.add(toCrawlLabel1);
        
        toCrawlLabel2 = new JLabel();
        toCrawlLabel2.setFont(
                toCrawlLabel2.getFont().deriveFont(Font.PLAIN));
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 0, 5);
        layout.setConstraints(toCrawlLabel2, constraints);
        searchPanel.add(toCrawlLabel2);
        
        JLabel progressLabel = new JLabel("Crawling Progress:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 0, 0);
        layout.setConstraints(progressLabel, constraints);
        searchPanel.add(progressLabel);
        
        progressBar = new JProgressBar();
        progressBar.setMinimum(0);
        progressBar.setStringPainted(true);
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 0, 5);
        layout.setConstraints(progressBar, constraints);
        searchPanel.add(progressBar);
        
        JLabel matchesLabel1 = new JLabel("Search Matches:");
        constraints = new GridBagConstraints();
        constraints.anchor = GridBagConstraints.EAST;
        constraints.insets = new Insets(5, 5, 10, 0);
        layout.setConstraints(matchesLabel1, constraints);
        searchPanel.add(matchesLabel1);
        
        matchesLabel2 = new JLabel();
        matchesLabel2.setFont(
                matchesLabel2.getFont().deriveFont(Font.PLAIN));
        constraints = new GridBagConstraints();
        constraints.fill = GridBagConstraints.HORIZONTAL;
        constraints.gridwidth = GridBagConstraints.REMAINDER;
        constraints.insets = new Insets(5, 5, 10, 5);
        layout.setConstraints(matchesLabel2, constraints);
        searchPanel.add(matchesLabel2);
        
        // Set up matches table.
        table =
                new JTable(new DefaultTableModel(new Object[][]{},
                new String[]{"URL"}) {
            public boolean isCellEditable(int row, int column) {
                return false;
            }
        });
        
        // Set up matches panel.
        JPanel matchesPanel = new JPanel();
        matchesPanel.setBorder(
                BorderFactory.createTitledBorder("Matches"));
        matchesPanel.setLayout(new BorderLayout());
        matchesPanel.add(new JScrollPane(table),
                BorderLayout.CENTER);
        
        // Add panels to display.
        getContentPane().setLayout(new BorderLayout());
        getContentPane().add(searchPanel, BorderLayout.NORTH);
        getContentPane().add(matchesPanel, BorderLayout.CENTER);
        
        //DMK 02/02/2012
        //We're at the end of the constructor, so we are ready to start
        actionSearch();
    }
    
    // Exit this program.
    private void actionExit() {
        //DMK   
        mp3File.print("   ");
        mp3File.print("");    
        mp3File.close();
        File Runner = new File("IAmRunning");
        boolean worked = false;
        worked = Runner.delete(); //DMK 02/02/2012  my hack to synchronize batch files
        if (worked)
        {
            System.out.println("Finished searching for " + searchName);
        }
        else
        {
            System.out.println("Failed to delete file: IAmRunning");
        }
        // DMK
        System.gc();
        for (int anInt = 0; anInt < 32000; anInt++);   // spin through an empty loop briefly before exiting
        
        System.exit(0);
    }
    
    // Handle search/stop button being clicked.
    private void actionSearch() {
        // If stop button clicked, turn crawling flag off.
        if (crawling) 
        {
            crawling = false;
            // DMK write ending tags to mp3.html in actionExit()
            return;
        }
        
        ArrayList errorList = new ArrayList();
        
        // Validate that start URL has been entered.
        startTextField.setText(searchURL);
        String startUrl = startTextField.getText().trim();
        //String startUrl = searchURL;  DMK 02/02/2012
        if (startUrl.length() < 1) {
            errorList.add("Missing Start URL.");
        }
        // Verify start URL.
        else if (verifyUrl(startUrl) == null) {
            errorList.add("Invalid Start URL.");
        }
        
        // Validate that max URLs is either empty or is a number.
        //DMK 02/02/2012
        int maxUrls = 100000;
        // String max = ((String) maxComboBox.getSelectedItem()).trim();
        String max = "100000";
        /*
        if (max.length() > 0) 
        {
            try 
            {
                maxUrls = Integer.parseInt(max);
            }
            catch (NumberFormatException e) 
            {
            }
            if (maxUrls < 1) 
            {
                errorList.add("Invalid Max URLs value.");
            }            
            
        }
        */
        // Validate that matches log file has been entered.
        String logFile = logTextField.getText().trim();
        if (logFile.length() < 1) {
            errorList.add("Missing Matches Log File.");
        }
        
        // Validate that search string has been entered.
        //DMK 02/04/2012 split up the name into first and last name
        searchName = searchName.trim();
        searchTextField.setText(searchName);
        String searchString = searchTextField.getText().trim();
        //String searchString = searchName;   DMK 02/02/2012
        if (searchString.length() < 1) {
            errorList.add("Missing Search String.");
        }
        
        // Show errors, if any, and return.
        if (errorList.size() > 0) {
            StringBuffer message = new StringBuffer();
            
            // Concatenate errors into single message.
            for (int i = 0; i < errorList.size(); i++) {
                message.append(errorList.get(i));
                if (i + 1 < errorList.size()) {
                    message.append("\n");
                }
            }
            
            showError(message.toString());
            return;
        }
        
        // Remove "www" from start URL if present.
        startUrl = removeWwwFromUrl(startUrl);
        
        // Start the search crawler.
        search(logFile, startUrl, maxUrls, searchString);
    }
    
    private void search(final String logFile, final String startUrl,
            final int maxUrls, final String searchString) {
        // Start the search in a new thread.
        Thread thread = new Thread(new Runnable() {
            public void run() {
                // Show hour glass cursor while crawling is under way.
                setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));
                
                // Disable search controls.
                startTextField.setEnabled(false);
                maxComboBox.setEnabled(false);
                limitCheckBox.setEnabled(false);
                logTextField.setEnabled(false);
                searchTextField.setEnabled(false);
                caseCheckBox.setEnabled(false);
                
                // Switch search button to "Stop."
                searchButton.setText("Stop");
                
                // Reset stats.
                table.setModel(new DefaultTableModel(new Object[][]{},
                        new String[]{"URL"}) {
                    public boolean isCellEditable(int row, int column) {
                        return false;
                    }
                });
                updateStats(startUrl, 0, 0, maxUrls);
                
                // Open matches log file.
                try {
                    logFileWriter = new PrintWriter(new FileWriter(logFile));
                } catch (Exception e) {
                    showError("Unable to open matches log file.");
                    return;
                }
                
                // Turn crawling flag on.
                crawling = true;
                
                // Perform the actual crawling.
                crawl(startUrl, maxUrls, limitCheckBox.isSelected(),
                        searchString, caseCheckBox.isSelected());
                
                // Turn crawling flag off.
                crawling = false;
                
                // Close matches log file.
                try {
                    logFileWriter.close();
                } catch (Exception e) {
                    showError("Unable to close matches log file.");
                }
                
                // Mark search as done.
                crawlingLabel2.setText("Done");
                
                // Enable search controls.
                startTextField.setEnabled(true);
                maxComboBox.setEnabled(true);
                limitCheckBox.setEnabled(true);
                logTextField.setEnabled(true);
                searchTextField.setEnabled(true);
                caseCheckBox.setEnabled(true);
                
                // Switch search button back to "Search."
                searchButton.setText("Search");
                
                // Return to default cursor.
                setCursor(Cursor.getDefaultCursor());
                
                // Show message if search string not found.
                if (table.getRowCount() == 0) {
                    JOptionPane.showMessageDialog(SearchMP3.this,
                     "Your Search String was not found. Please try another.",
                     "Search String Not Found",
                     JOptionPane.WARNING_MESSAGE);
                }
            }
        });
        thread.start();
    }
    
    // Show dialog box with error message.
    private void showError(String message) {
        JOptionPane.showMessageDialog(this, message, "Error",
                JOptionPane.ERROR_MESSAGE);
    }
    
    // Update crawling stats.
    private void updateStats(
            String crawling, int crawled, int toCrawl, int maxUrls) {
        crawlingLabel2.setText(crawling);
        crawledLabel2.setText("" + crawled);
        toCrawlLabel2.setText("" + toCrawl);
        crawlCntr = crawled;
        
        // Update progress bar.
        if (maxUrls == -1) {
            progressBar.setMaximum(crawled + toCrawl);
        } else {
            progressBar.setMaximum(maxUrls);
        }
        progressBar.setValue(crawled);
        
        matchesLabel2.setText("" + table.getRowCount());
    }
    
    // Add match to matches table and log file.
    private void addMatch(String url) {
        // Add URL to matches table.
        DefaultTableModel model =
                (DefaultTableModel) table.getModel();
        model.addRow(new Object[]{url});       
        
        // Add URL to matches log file.
        try {
            logFileWriter.println(url);
        } catch (Exception e) {
            showError("Unable to log match.");
        }
    }
    
    // Verify URL format.
    private URL verifyUrl(String url) {
        // Only allow HTTP URLs.
        if (!url.toLowerCase().startsWith("http://"))
             return null;
        
        // Verify format of URL.
        URL verifiedUrl = null;
        try {
            verifiedUrl = new URL(url);
        } catch (Exception e) {
            return null;
        }
        
        return verifiedUrl;
    }
    
    // Check if robot is allowed to access the given URL.
    private boolean isRobotAllowed(URL urlToCheck) {
        String host = urlToCheck.getHost().toLowerCase();
        
        // Retrieve host's disallow list from cache.
        ArrayList disallowList =
                (ArrayList) disallowListCache.get(host);
        
        // If list is not in the cache, download and cache it.
        if (disallowList == null) {
            disallowList = new ArrayList();
            
            try {
                URL robotsFileUrl =
                        new URL("http://" + host + "/robots.txt");
                
                // Open connection to robot file URL for reading.
                BufferedReader reader =
                        new BufferedReader(new InputStreamReader(
                        robotsFileUrl.openStream()));
                
                // Read robot file, creating list of disallowed paths.
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.indexOf("Disallow:") == 0) {
                        String disallowPath =
                                line.substring("Disallow:".length());
                        
                        // Check disallow path for comments and 
                        // remove if present.
                        int commentIndex = disallowPath.indexOf("#");
                        if (commentIndex != - 1) {
                            disallowPath =
                                    disallowPath.substring(0, commentIndex);
                        }
                        
                        // Remove leading or trailing spaces from 
                        // disallow path.
                        disallowPath = disallowPath.trim();
                        
                        // Add disallow path to list.
                        disallowList.add(disallowPath);
                    }
                }
                
                // Add new disallow list to cache.
                disallowListCache.put(host, disallowList);
            } catch (Exception e) {
        /* Assume robot is allowed since an exception
           is thrown if the robot file doesn't exist. */
                return true;
            }
        }
        
    /* Loop through disallow list to see if the
       crawling is allowed for the given URL. */
        String file = urlToCheck.getFile();
        for (int i = 0; i < disallowList.size(); i++) {
            String disallow = (String) disallowList.get(i);
            if (file.startsWith(disallow)) {
                // return false;
                return true;  //DMK - robots 
            }
        }
        
        return true;
    }
    
    // Download page at given URL.
    private String downloadPage(URL pageUrl) {
        try {
            // Open connection to URL for reading.
            BufferedReader reader =
                    new BufferedReader(new InputStreamReader(
                    pageUrl.openStream()));
            
            // Read page into buffer.
            String line;
            StringBuffer pageBuffer = new StringBuffer();
            while ((line = reader.readLine()) != null) {
                pageBuffer.append(line);
            }
            
            return pageBuffer.toString();
        } catch (Exception e) {
        }
        
        return null;
    }
    
    // Remove leading "www" from a URL's host if present.
    private String removeWwwFromUrl(String url) {
        int index = url.indexOf("://www.");
        if (index != -1) {
            return url.substring(0, index + 3) +
                    url.substring(index + 7);
        }
        
        return (url);
    }
    
    // Parse through page contents and retrieve links.
    private ArrayList retrieveLinks(
            URL pageUrl, String pageContents, HashSet crawledList,
            boolean limitHost) {
        // Compile link matching pattern.
        Pattern p =
                Pattern.compile("]",
                Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(pageContents);
        
        // Create list of link matches.
        ArrayList linkList = new ArrayList();
        while (m.find()) {
            String link = m.group(1).trim();
            //DMK
            try
            {
               if (link.length() < 1) link = "badurl";
            }
            catch(NullPointerException e)
            {
               link = "badurl";
            }

            // Prefix absolute and relative URLs if necessary.
            if (link.indexOf("://") == -1) {
                // Handle absolute URLs.
                if (link.charAt(0) == '/') {
                    link = "http://" + pageUrl.getHost() + link;
                    // Handle relative URLs.
                } else {
                    String file = pageUrl.getFile();
                    if (file.indexOf('/') == -1) {
                        link = "http://" + pageUrl.getHost() + "/" + link;
                    } else {
                        String path =
                                file.substring(0, file.lastIndexOf('/') + 1);
                        link = "http://" + pageUrl.getHost() + path + link;
                    }
                }
            }
            
            // Remove anchors from link.
            int index = link.indexOf('#');
            if (index != -1) {
                link = link.substring(0, index);
            }
            
            // Remove leading "www" from URL's host if present.
            link = removeWwwFromUrl(link);
            
            // Verify link and skip if invalid.
            URL verifiedLink = verifyUrl(link);
            if (verifiedLink == null) {
                continue;
            }
            
      /* If specified, limit links to those
         having the same host as the start URL. */
            if (limitHost &&
                    !pageUrl.getHost().toLowerCase().equals(
                    verifiedLink.getHost().toLowerCase())) {
                continue;
            }
            
            // Skip link if it has already been crawled.
            if (crawledList.contains(link)) {
                continue;
            }
            
            // Add link to list.
            linkList.add(link);
        }
        
        return (linkList);
    }
    
  /* Write mp3 Addresses if there are any */
    private void writemp3(String url)
    {                 
           if (GotAMatch) //DMK 05/11/2015, I was searching through page contents at this point and using a regular expression for web addresses
           {	          //ending in mp3 and/or wav. Just sending url value now, and added ogg vorbis as an acceptable file suffix.       
                 WroteAnAddress = true;     
                 if (url.endsWith(".mp3") || url.endsWith(".MP3") || url.endsWith(".wav") || url.endsWith(".WAV") || url.endsWith(".ogg") || url.endsWith(".OGG"))				 
                  try
                  {   
                      System.out.println(url);
                      mp3File.print("<a href=\"");
                      mp3File.print(url);
                      mp3File.print("\">" + url);
                      mp3File.println("</a><br>");
                      mailCounter++;   //DMK 02/02/2012
                      try
                      {
                         java.lang.Thread.sleep(1);
                      }
                      catch(InterruptedException e)
                      {
                         System.out.println("Thread api failed\n");
                      }
                   }
                   catch (Exception e)
                   {
                      System.out.println("Failed to print line of html\n");
                   }
           }
           GotAMatch = false;
           System.out.println("We now have " + mailCounter + " mp3 addresses");
    }
    
    // Perform the actual crawling, searching for the search string.
    public void crawl(
            String startUrl, int maxUrls, boolean limitHost,
            String searchString, boolean caseSensitive) {
        // Setup crawl lists.
        HashSet crawledList = new HashSet();
        LinkedHashSet toCrawlList = new LinkedHashSet();
        
        // Add start URL to the to crawl list.
        toCrawlList.add(startUrl);
        
        /* Perform actual crawling by looping
           through the to crawl list. */
        while (crawling && toCrawlList.size() > 0) {
            /* Check to see if the max URL count has
               been reached, if it was specified.*/
            if (maxUrls != -1) {
                if (crawledList.size() == maxUrls) {
                    break;
                }
            }
            
            // Get URL at bottom of the list.
            String url = (String) toCrawlList.iterator().next();
            
            // Remove URL from the to crawl list.
            toCrawlList.remove(url);
            
            // Convert string url to URL object.
            URL verifiedUrl = verifyUrl(url);
            
            // Skip URL if robots are not allowed to access it.
            if (!isRobotAllowed(verifiedUrl)) {
                continue;
            }
            
            // Update crawling stats.
            updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);
            
            // Add page to the crawled list.
            crawledList.add(url);
            
            // Download the page at the given url.
            String pageContents = downloadPage(verifiedUrl);
            
            /* If the page was downloaded successfully, retrieve all of its
               links and then see if it contains the search string. */
            if (pageContents != null && pageContents.length() > 0) {
                // Retrieve list of valid links from page.
                ArrayList links =
                        retrieveLinks(verifiedUrl, pageContents, crawledList, limitHost);
                
                // Add links to the to crawl list.
                toCrawlList.addAll(links);
                
                /* Check if search string is present in
                   page and if so record a match. */
                String lowerPage = pageContents.toLowerCase();
                String lowerSrch = searchString.toLowerCase();
                if (lowerPage.indexOf(lowerSrch) > -1) {
                    addMatch(url);
                    GotAMatch = true;
                    writemp3(url);
                    //DMK 05/11/2015, I was originally passing pageContents to this routine.
                    //When I wrote this (7 years ago, I think) I must've been thinking that there were possible sound files that were being
                    //missed, and had writemp3 search for them, before writing the sound files out. Changed this to passing just the found url.
                    /*
                    BufferedReader inputReader =
                            new BufferedReader(new InputStreamReader(System.in));
                    String pauseData;
                    try
                    {
                       pauseData = inputReader.readLine();
                    }
                    catch(IOException e)
                    {
                       System.out.println("Error reading keyboard input");
                    }
                    */
                }
            }
            
            // Update crawling stats.
            updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);
        }
    }
    
    // Run the Search Crawler.
    public static void main(String[] args) {
        SearchMP3 crawler;
        if ((args.length > 2) && (args[2].equals("limit")))
        {
            crawler = new SearchMP3(args[0], args[1], true);
        }
        else
        {
            crawler = new SearchMP3(args[0], args[1], false);
        }
        crawler.show();
    }
}


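To run it from the command line, main() expects the search string as the first argument, the start URL as the second, and an optional third argument of "limit" to keep the crawl on the start site. Something like this (the search string and URL here are just placeholders):

javac SearchMP3.java
java SearchMP3 mp3 http://www.example.com limit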