使用Selenium进行网页抓取,性能缓慢?

3
基本上,我正在尝试通过selenium获取网页的所有信息,但随着时间的推移,速度会变慢......到无法完成人员列表并超时的程度。
我知道它理应随着时间推移而变慢,因为我维护了一个保存已记录 id 的 HashSet,并且每次循环都会检查当前 id 是否已经出现过。
我附上了一堆代码展示如何从网站中提取数据,但我怀疑主要问题不在于此。我认为我忽略了某些东西或者有某种资源泄漏或者selenium限制......
因此,如果我以安静模式启动webmanager,在120个循环后它不会出错,如果我以普通chromedriver启动它,则最终会出错并跳过人员......我假设这是因为当处理网页时我触碰了它或者其他原因。
除了所有其他问题之外,
- 你是否看到任何明显的资源泄漏?
- 你知道为什么它最终会停止,并且变得慢到无法使用吗?
- 是否有一些垃圾我没有处理?
- 如何提高速度?
WebManager类:
/// <summary>
/// Starts a local Chrome session and immediately navigates it to
/// <paramref name="website"/>.
/// </summary>
/// <param name="website">URL handed to the driver's navigation call.</param>
public WebManager(string website)
    {
        driver = new ChromeDriver();
        try
        {
            driver.Navigate().GoToUrl(website);
        }
        catch
        {
            // The constructor is about to fail, so the caller will never get a
            // reference it could use to close the browser that was just started.
            // Shut the session down here, then let the original error propagate.
            driver.Quit();
            throw;
        }
    }

    public WebManager(Boolean quiet)
    {
        if (!quiet)
            driver = new ChromeDriver();
        else
        {
            var processInfo = new ProcessStartInfo("java.exe", "-jar quietserver.jar")
            {
                CreateNoWindow = true,
                UseShellExecute = false
            };
            quietServer = Process.Start(processInfo);
            driver = new RemoteWebDriver(DesiredCapabilities.HtmlUnit());
        }
    }

程序的主要流程:
/// <summary>
/// Walks every county list page, opens each offender's detail page, reads it
/// into an <c>Offender</c> object and saves it.
/// </summary>
/// <remarks>
/// The starting county is derived from the number of directories already
/// present under <c>utils.savePath</c>. For each county the list rows are
/// visited by index; an id already saved for that county is skipped. Any
/// exception raised while handling a row is logged and ends that county's
/// loop. The web manager is closed when the method leaves, whether it
/// finished normally or an exception escaped.
/// </remarks>
public void doScrape()
    {
        int fileCount = Directory.GetDirectories(utils.savePath).Length;
        int startCounty = (fileCount == 0 ? 1 : fileCount);
        string lastOffenderId = null;

        if (fileCount > 4 && localScrape)
        {
            Console.WriteLine("Please clear storage folders...");
            Console.Read();
            Environment.Exit(1);
        }

        webManager = new WebManager(quiet);

        try
        {
            for (int i = (localScrape ? 0 : startCounty); i <= (localScrape ? 2 : 64); i++)
            {
                var county = (localScrape ? localCounties[i] : i);

                webManager.driver.Navigate().GoToUrl(getOffenderListURL(county));
                HashSet<string> completedList = new HashSet<string>();

                // Keep only the part that follows the "label: " prefix of the heading.
                string heading = webManager.getElementByxPath(countyxPath).Text;
                string locationStr = heading.Substring(heading.IndexOf(':') + 2);
                Console.WriteLine("Working on county: " + locationStr);

                for (int l = 2; l < 10000; l++)
                {
                    try
                    {
                        var offenderLink = new WebDriverWait(webManager.driver, TimeSpan.FromSeconds(5)).Until(ExpectedConditions.ElementExists((By.XPath(getOffenderxPath(l)))));
                        string linkToOffender = offenderLink.GetAttribute("href");
                        string offenderId = linkToOffender.Substring(linkToOffender.IndexOf('=') + 1);
                        if (completedList.Contains(offenderId))
                        {
                            Console.WriteLine("Offender id " + offenderId + " has multiple aliases one of which is: " + offenderLink.Text);
                            continue;
                        }
                        lastOffenderId = offenderId;
                        offenderLink.Click();

                        Offender offender = scrapeOffenderDetails(offenderId);

                        offender.Save(utils.getSaveLocation(locationStr, offender.offenderId));
                        //------ add to completed offender id list --------
                        completedList.Add(offenderId);
                        webManager.driver.Navigate().GoToUrl(getOffenderListURL(county));
                    }
                    catch (Exception e)
                    {
                        // Any failure - including the wait above timing out on a
                        // row index that does not exist - ends this county.
                        Console.WriteLine(e.Message);
                        Console.WriteLine("Last offender id " + lastOffenderId);
                        break;
                    }
                }
            }
        }
        finally
        {
            // The county navigation and heading lookup sit outside the per-row
            // catch, so an exception can leave the loops early; close the
            // driver on every exit path.
            webManager.close();
        }
    }

    /// <summary>
    /// Reads the currently displayed offender detail page, top to bottom, into a
    /// new <c>Offender</c>. The sections are read in page order because the
    /// later ones are located relative to where the earlier ones ended.
    /// </summary>
    /// <param name="offenderId">Id taken from the list link; stored on the result and used for the photo link.</param>
    private Offender scrapeOffenderDetails(string offenderId)
    {
        var currentPlacement = webManager.getElementTextByxPath(currentPlacementxPath);
        var lastName = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 2));
        var firstName = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 3));
        var middleName = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 4));
        var dob = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 5));
        var sex = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 6));
        var riskLevel = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 7));
        var designation = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 8));
        Console.WriteLine("Offender info: " + currentPlacement + " " + lastName + " " + firstName + " " + middleName + " " + dob + " " + sex + " " + designation);

        var race = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 1));
        var ethnicity = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 2));
        var height = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 3));
        var weight = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 4));
        var hair = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 5));
        var eyes = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 6));
        var lenses = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 7));
        var photodate = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 8));

        var jurisdiction = webManager.getElementTextByxPath(jurisductionxPath);

        Address[] addresses = scrapeAddresses();

        //---------- Current Conviction logic -----------------------
        ConvictionDetails[] convictionDetails = scrapeCurrentConvictionDetails();
        // The data row comes directly after the last detail row.
        int currentDataRow = convictionDetails.Length + 1;
        Conviction currentConviction = scrapeConvictionData(
            field => getConvictionDataxPath(currentDataRow, field), convictionDetails);

        Console.WriteLine("-------Current Conviction --------");
        Console.WriteLine(currentConviction.ToString());

        //----------- Previous Conviction logic ----------------------
        int lastDiv;
        Conviction[] previousConvictions = scrapePreviousConvictions(out lastDiv);
        if (previousConvictions != null)
        {
            Console.WriteLine("-----Previous convictions------");
            foreach (Conviction c in previousConvictions)
            {
                Console.WriteLine(c.ToString());
            }
        }

        //----------------- Supervising until Scars ------------------
        // The first paragraph index to read differs depending on whether any
        // previous convictions were found.
        int adjustedParagraph = (previousConvictions == null ? 4 : 3);
        var supervisingAgency = webManager.getElementTextByxPath(getMainContentParagraph(adjustedParagraph));
        var specialConditions = webManager.getElementTextByxPath(getMainContentParagraph(adjustedParagraph + 1));
        var maximumExpire = webManager.getElementTextByxPath(getMainContentParagraph(adjustedParagraph + 2));

        // The last paragraph used above was adjustedParagraph + 2.
        int lastParagraph;
        string[] markings = scrapeMarkings(adjustedParagraph + 3, out lastParagraph);
        string[] aliases = scrapeAliases(lastParagraph + 1);
        Vehicle[] vehicles = scrapeVehicles(lastDiv);

        //-------- Creating & Adding fields into Offender Object----------
        return new Offender
        {
            currentPlacement = currentPlacement,
            designation = designation,
            ethnicity = ethnicity,
            dob = dob,
            eyeColor = eyes,
            hairColor = hair,
            CorrectiveLens = lenses,
            height = height,
            weight = weight,
            photoDate = photodate,
            offenderId = offenderId,
            riskLevel = riskLevel,
            race = race,
            sex = sex,
            lastName = lastName,
            firstName = firstName,
            middleName = middleName,
            address = addresses,
            jurisdiction = jurisdiction,
            currentConviction = currentConviction,
            perviousConvictions = previousConvictions,
            supervisingInfo = supervisingAgency,
            conditions = specialConditions,
            maximumDate = maximumExpire,
            markings = markings,
            aliases = aliases,
            currentVehicles = vehicles,
            linkToPic = getPhotoLink(offenderId)
        };
    }

    /// <summary>
    /// Reads up to 19 address rows, stopping at the first row that is not on
    /// the page, and logs what was found.
    /// </summary>
    private Address[] scrapeAddresses()
    {
        List<Address> addressList = new List<Address>();

        for (int x = 1; x < 20; x++)
        {
            try
            {
                // Existence probe: FindElement throws when row x is absent.
                webManager.driver.FindElement(By.XPath(getOffenderAddress(x, 1)));

                addressList.Add(new Address
                {
                    type = webManager.getElementTextByxPath(getOffenderAddress(x, 1)),
                    county = webManager.getElementTextByxPath(getOffenderAddress(x, 2)),
                    location = webManager.getElementTextByxPath(getOffenderAddress(x, 3))
                });
            }
            catch (NoSuchElementException)
            {
                break;
            }
        }

        Console.WriteLine(addressList.Count > 1
            ? "Multiple addresses... listing"
            : (addressList.Count == 1 ? "Only one address found" : "No addresses found"));
        foreach (Address aa in addressList)
        {
            Console.WriteLine(aa.ToString());
        }

        return addressList.ToArray();
    }

    /// <summary>
    /// Reads the detail rows of the current conviction. Rows are consumed until
    /// the row title contains "Date" (the start of the conviction data row) or
    /// an element is missing.
    /// </summary>
    private ConvictionDetails[] scrapeCurrentConvictionDetails()
    {
        List<ConvictionDetails> detailsList = new List<ConvictionDetails>();

        for (int x = 1; x < 20; x++)
        {
            int row = x;
            try
            {
                // The first span tells whether this row is still a conviction
                // detail or already the data row that follows the details.
                var spanTitle = webManager.driver.FindElement(By.XPath(getConvictionTitlexPath(row)));
                if (spanTitle.Text.Contains("Date"))
                    break;

                var title = webManager.driver.FindElement(By.XPath(getConvictionDetailsxPath(row, 1)));
                detailsList.Add(scrapeDetailFields(title.Text, field => getConvictionDetailsxPath(row, field)));
            }
            catch (NoSuchElementException)
            {
                break;
            }
        }

        return detailsList.ToArray();
    }

    /// <summary>
    /// Builds one <c>ConvictionDetails</c> from an already-read title plus
    /// fields 2 through 7 of a row.
    /// </summary>
    /// <param name="title">Text of field 1, read by the caller.</param>
    /// <param name="fieldxPath">Maps a 1-based field number to that field's XPath within the row.</param>
    private ConvictionDetails scrapeDetailFields(string title, Func<int, string> fieldxPath)
    {
        return new ConvictionDetails
        {
            title = title,
            section = webManager.getElementTextByxPath(fieldxPath(2)),
            subsection = webManager.getElementTextByxPath(fieldxPath(3)),
            c_class = webManager.getElementTextByxPath(fieldxPath(4)),
            categlory = webManager.getElementTextByxPath(fieldxPath(5)),
            counts = webManager.getElementTextByxPath(fieldxPath(6)),
            description = webManager.getElementTextByxPath(fieldxPath(7))
        };
    }

    /// <summary>
    /// Builds a <c>Conviction</c> from the eleven fields of a conviction data
    /// row, read in field order 1 to 11.
    /// </summary>
    /// <param name="fieldxPath">Maps a 1-based field number to that field's XPath within the data row.</param>
    /// <param name="details">Detail rows already collected for this conviction.</param>
    private Conviction scrapeConvictionData(Func<int, string> fieldxPath, ConvictionDetails[] details)
    {
        return new Conviction
        {
            crimeDate = webManager.getElementTextByxPath(fieldxPath(1)),
            convictionDate = webManager.getElementTextByxPath(fieldxPath(2)),
            victimInfo = webManager.getElementTextByxPath(fieldxPath(3)),
            arrestingAgency = webManager.getElementTextByxPath(fieldxPath(4)),
            offenseDescription = webManager.getElementTextByxPath(fieldxPath(5)),
            relationship = webManager.getElementTextByxPath(fieldxPath(6)),
            weaponsUsed = webManager.getElementTextByxPath(fieldxPath(7)),
            forceUsed = webManager.getElementTextByxPath(fieldxPath(8)),
            computerUsed = webManager.getElementTextByxPath(fieldxPath(9)),
            pornInvolved = webManager.getElementTextByxPath(fieldxPath(10)),
            sentance = webManager.getElementTextByxPath(fieldxPath(11)),
            details = details
        };
    }

    /// <summary>
    /// Reads previous convictions from divisions 3 through 9. For every
    /// division that has at least one detail row, its "more info" button is
    /// clicked and the data row that follows the details is read.
    /// </summary>
    /// <param name="lastDiv">
    /// Receives the last division examined. When the scan stops on a division
    /// without detail rows this is one past the last conviction division; the
    /// vehicle lookup relies on that.
    /// </param>
    /// <returns>The convictions found, or null when there are none.</returns>
    private Conviction[] scrapePreviousConvictions(out int lastDiv)
    {
        lastDiv = 0;
        List<Conviction> convictions = new List<Conviction>();

        for (int x = 3; x < 10; x++)
        {
            int div = x;
            List<ConvictionDetails> detailsList = new List<ConvictionDetails>();
            try
            {
                for (int y = 1; y < 10; y++)
                {
                    int row = y;
                    try
                    {
                        var spanTitle = webManager.driver.FindElement(By.XPath(getListTitlexPathByDiv(div, row)));
                        if (!spanTitle.Text.Contains("Title"))
                            break;

                        var title = webManager.getElementTextByxPath(getListxPathByDiv(div, row, 1));
                        detailsList.Add(scrapeDetailFields(title, field => getListxPathByDiv(div, row, field)));
                    }
                    catch (NoSuchElementException)
                    {
                        break;
                    }
                }

                // Recorded before the emptiness check on purpose, so the caller
                // sees the division at which the scan stopped.
                lastDiv = div;
                if (detailsList.Count == 0)
                    break;

                webManager.driver.FindElement(By.XPath(getPreviousMoreInfoButton(div))).Click();

                // Fixed pause after the click, before the data row is read.
                // NOTE(review): presumably gives the expanded section time to
                // render - an explicit wait on the first data field would be
                // faster and more reliable; confirm what the click loads.
                Thread.Sleep(1000);

                int dataRow = detailsList.Count + 1;
                convictions.Add(scrapeConvictionData(
                    field => getListxPathByDiv(div, dataRow, field), detailsList.ToArray()));
            }
            catch (NoSuchElementException)
            {
                break;
            }
        }

        return convictions.Count > 0 ? convictions.ToArray() : null;
    }

    /// <summary>
    /// Collects the marking paragraphs: up to ten paragraphs starting at
    /// <paramref name="firstParagraph"/>, stopping at one containing "None" or
    /// at the first paragraph that is not above the alias heading.
    /// </summary>
    /// <param name="firstParagraph">Index of the first paragraph to examine.</param>
    /// <param name="lastParagraph">
    /// Receives the index of the last paragraph consumed (a "None" paragraph
    /// counts as consumed). NOTE(review): stays 0 when the very first paragraph
    /// is not above the alias heading - confirm that case cannot occur.
    /// </param>
    private string[] scrapeMarkings(int firstParagraph, out int lastParagraph)
    {
        lastParagraph = 0;
        List<string> markingList = new List<string>();

        for (int x = firstParagraph; x < firstParagraph + 10; x++)
        {
            var paragraphText = webManager.getElementTextByxPath(getMainContentParagraph(x));
            if (paragraphText.Contains("None"))
            {
                lastParagraph = x;
                break;
            }

            var aliasHeading = webManager.getElementByxPath(aliasHeadingxPath);
            var paragraph = webManager.getElementByxPath(getMainContentParagraph(x));
            if (aliasHeading.Location.Y > paragraph.Location.Y)
            {
                markingList.Add(paragraph.Text);
                lastParagraph = x;
            }
            else
                break;
        }

        return markingList.ToArray();
    }

    /// <summary>
    /// Collects the alias paragraphs: up to nine paragraphs starting at
    /// <paramref name="firstParagraph"/>, stopping at a missing paragraph, one
    /// containing "None", or the first one that is not above the current
    /// vehicle heading.
    /// </summary>
    /// <param name="firstParagraph">Index of the first paragraph to examine.</param>
    private string[] scrapeAliases(int firstParagraph)
    {
        List<string> aliasList = new List<string>();

        for (int x = firstParagraph; x < firstParagraph + 9; x++)
        {
            IWebElement paragraph;
            try
            {
                paragraph = webManager.driver.FindElement(By.XPath(getMainContentParagraph(x)));
                if (paragraph.Text.Contains("None"))
                    break;
            }
            catch (NoSuchElementException)
            {
                break;
            }

            // Deliberately outside the try: a missing vehicle heading is not an
            // end-of-list signal and is left to propagate to the caller.
            var currentVehicleHeading = webManager.driver.FindElement(By.XPath(currentVehiclexPath));
            if (currentVehicleHeading.Location.Y > paragraph.Location.Y)
            {
                string aliasText = paragraph.Text;
                aliasList.Add(aliasText);
                Console.WriteLine("Offender has alias: " + aliasText);
            }
            else
                break;
        }

        return aliasList.ToArray();
    }

    /// <summary>
    /// Reads the vehicle rows found in division <paramref name="lastDiv"/>.
    /// </summary>
    /// <param name="lastDiv">Division index reported by the previous-conviction scan.</param>
    /// <returns>
    /// Null when the element at the vehicle paragraph XPath is present;
    /// otherwise the rows read (up to nine, possibly none).
    /// </returns>
    private Vehicle[] scrapeVehicles(int lastDiv)
    {
        // FindElements returns an empty collection instead of throwing, so the
        // vehicle rows no longer have to be read from inside a catch block.
        if (webManager.driver.FindElements(By.XPath(getVehiclePxPath(lastDiv))).Count > 0)
            return null;

        List<Vehicle> vehicleList = new List<Vehicle>();
        for (int x = 1; x < 10; x++)
        {
            try
            {
                vehicleList.Add(new Vehicle
                {
                    plate = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 1))).Text,
                    state = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 2))).Text,
                    year = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 3))).Text,
                    makeModel = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 4))).Text,
                    color = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 5))).Text
                });
            }
            catch (NoSuchElementException)
            {
                break;
            }
        }

        return vehicleList.ToArray();
    }
1个回答

1

最终改用了另一个 HTML 解析器:HtmlAgilityPack。


网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接