Monday, September 1, 2014

Re: ajax-crawling approach to make gwt app crawlable works on default gwt example, but not on my app

Hi all, 
I am facing the same problem. I am stuck at the same point: my GWT app is still not crawlable even after applying all the code described at https://developers.google.com/webmasters/ajax-crawling/.
Please help me with some code or a link.

On Friday, March 30, 2012 10:01:56 PM UTC+5:30, erebrus wrote:
Hi all,
I was reading https://developers.google.com/webmasters/ajax-crawling/
on how to make AJAX apps (and consequently GWT apps) crawlable.
I took the code from Google (summarized in point 3 of "How to create
an HTML snapshot?") to create a filter that returns the HTML of an AJAX
page by rendering it with HtmlUnit, and changed the web.xml accordingly.
I created a new GWT project with the example code and applied the filter
and the web.xml there. It worked right away.
However, I did exactly the same on the GWT app I want to make
searchable, and there it doesn't work. For some reason, the only
requests the filter ever receives are the GWT-RPC calls.
I think I must be missing a terribly simple detail, but I'm a bit lost
on where to go from here.
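
One way to narrow this down might be to hit the filter directly, bypassing the crawler: if a hand-built _escaped_fragment_ request comes back with the rendered snapshot, the filter and its mapping are fine and the problem is on the URL/crawler side. Here is a rough sketch of such a check; the host, port and page name are placeholder values for a local dev-mode setup, adjust them to the app being tested:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

// Rough sketch: request the host page the way the crawler would and print the
// start of the response. Host, port and page name are placeholder values for a
// local dev-mode setup.
public class EscapedFragmentCheck {
  public static void main(String[] args) throws Exception {
    URL url = new URL("http://localhost:8888/CrawlTest.html?_escaped_fragment_=");
    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    conn.setRequestMethod("GET");
    System.out.println("HTTP " + conn.getResponseCode());
    BufferedReader in = new BufferedReader(
        new InputStreamReader(conn.getInputStream(), "UTF-8"));
    try {
      String line;
      int count = 0;
      // The response should be the HtmlUnit-rendered snapshot, not the bare host page.
      while ((line = in.readLine()) != null && count++ < 20) {
        System.out.println(line);
      }
    } finally {
      in.close();
    }
  }
}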


Below you can see the code for the filter (CrawlServlet) and the
web.xml.

package crawltest.server;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.logging.Logger;

import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.FilterConfig;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

/**
 * Filter that makes this application crawlable by returning HTML snapshots
 * rendered with HtmlUnit.
 */
public final class CrawlServlet implements Filter {

  private static final Logger logger =
      Logger.getLogger(CrawlServlet.class.getName());

  /**
   * Rewrites the _escaped_fragment_ query string back into the original
   * #! form so that HtmlUnit loads the same state the crawler asked for.
   */
  private static String rewriteQueryString(String queryString)
      throws UnsupportedEncodingException {
    StringBuilder queryStringSb = new StringBuilder(queryString);
    int i = queryStringSb.indexOf("&_escaped_fragment_");
    if (i != -1) {
      StringBuilder tmpSb = new StringBuilder(queryStringSb.substring(0, i));
      tmpSb.append("#!");
      tmpSb.append(URLDecoder.decode(
          queryStringSb.substring(i + 20, queryStringSb.length()), "UTF-8"));
      queryStringSb = tmpSb;
    }

    i = queryStringSb.indexOf("_escaped_fragment_");
    if (i != -1) {
      StringBuilder tmpSb = new StringBuilder(queryStringSb.substring(0, i));
      tmpSb.append("#!");
      tmpSb.append(URLDecoder.decode(
          queryStringSb.substring(i + 19, queryStringSb.length()), "UTF-8"));
      queryStringSb = tmpSb;
    }
    if (queryStringSb.indexOf("#!") != 0) {
      queryStringSb.insert(0, '?');
    }
    queryString = queryStringSb.toString();

    return queryString;
  }

  private FilterConfig filterConfig = null;

  /**
   * Destroys the filter configuration
   */
  public void destroy() {
    this.filterConfig = null;
  }

  /**
   * Filters all requests and invokes the headless browser if necessary
   */
  public void doFilter(ServletRequest request, ServletResponse response,
      FilterChain chain) throws IOException {
    System.out.println("crawl");
    if (filterConfig == null) {
      return;
    }
    System.out.println("crawl");
    HttpServletRequest req = (HttpServletRequest) request;
    HttpServletResponse res = (HttpServletResponse) response;
    String queryString = req.getQueryString();
    System.out.println("query:" + queryString);
    System.out.println("param:" + req.getParameterMap().toString());
    System.out.println("req:" + req);
    if ((queryString != null) && (queryString.contains("_escaped_fragment_"))) {
      // Request comes from the crawler: rebuild the #! URL and render it
      // with HtmlUnit to produce an HTML snapshot.
      System.out.println("in!!");
      StringBuilder pageNameSb = new StringBuilder("http://");
      pageNameSb.append(req.getServerName());
      if (req.getServerPort() != 0) {
        pageNameSb.append(":");
        pageNameSb.append(req.getServerPort());
      }
      pageNameSb.append(req.getRequestURI());
      queryString = rewriteQueryString(queryString);
      pageNameSb.append(queryString);

      final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_3);
      webClient.setJavaScriptEnabled(true);
      String pageName = pageNameSb.toString();
      HtmlPage page = webClient.getPage(pageName);
      webClient.waitForBackgroundJavaScriptStartingBefore(2000);

      res.setContentType("text/html;charset=UTF-8");
      PrintWriter out = res.getWriter();
      out.println("<hr>");
      out.println("<center><h3>You are viewing a non-interactive page "
          + "that is intended for the crawler.  You probably want to see "
          + "this page: <a href=\"" + pageName + "\">" + pageName
          + "</a></h3></center>");
      out.println("<hr>");

      out.println(page.asXml());
      webClient.closeAllWindows();
      out.close();

    } else {
      // Normal request: pass it through to the rest of the application.
      try {
        chain.doFilter(request, response);
      } catch (ServletException e) {
        e.printStackTrace();
      }
    }
  }

  /**
   * Initializes the filter configuration
   */
  public void init(FilterConfig filterConfig) {
    this.filterConfig = filterConfig;
  }

}
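
For reference, here is a minimal standalone sketch of the query-string rewrite the filter performs, so the mapping can be checked outside a servlet container. The "photos" token and the "foo=bar" parameter are made-up example values:

import java.net.URLDecoder;

// Compact restatement of rewriteQueryString above, runnable on its own.
// The sample tokens below are made up for illustration.
public class RewriteSketch {

  static String rewrite(String q) throws Exception {
    String out = q;
    int i = out.indexOf("&_escaped_fragment_=");
    if (i != -1) {
      out = out.substring(0, i) + "#!"
          + URLDecoder.decode(out.substring(i + 20), "UTF-8");
    }
    i = out.indexOf("_escaped_fragment_=");
    if (i != -1) {
      out = out.substring(0, i) + "#!"
          + URLDecoder.decode(out.substring(i + 19), "UTF-8");
    }
    if (!out.startsWith("#!")) {
      out = "?" + out;
    }
    return out;
  }

  public static void main(String[] args) throws Exception {
    // Prints "#!photos": the crawler's ?_escaped_fragment_=photos maps back
    // to the application URL .../CrawlTest.html#!photos
    System.out.println(rewrite("_escaped_fragment_=photos"));
    // Prints "?foo=bar#!photos": extra query parameters are preserved
    System.out.println(rewrite("foo=bar&_escaped_fragment_=photos"));
  }
}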


web.xml:

<?xml version="1.0" encoding="UTF-8"?>
<web-app xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://java.sun.com/xml/ns/javaee
              http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd"
        version="2.5" xmlns="http://java.sun.com/xml/ns/javaee">

        <filter>
                <filter-name>CrawlServlet</filter-name>
                <filter-class>crawltest.server.CrawlServlet</filter-class>
        </filter>

        <filter-mapping>
                <filter-name>CrawlServlet</filter-name>
                <url-pattern>/*</url-pattern>
        </filter-mapping>

        <!-- Servlets -->

        <!-- Default page to serve -->
        <welcome-file-list>
                <welcome-file>CrawlTest.html</welcome-file>
        </welcome-file-list>

</web-app>
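
One more thing, in case it is relevant to why the filter only ever sees the RPC calls: as far as I understand the scheme, the crawler only issues ?_escaped_fragment_= requests for URLs whose fragment starts with "!" (or for pages that declare <meta name="fragment" content="!"> in their head), so the app's history tokens need that prefix. A minimal client-side sketch, with "photos" as a made-up token:

import com.google.gwt.user.client.History;

// Minimal client-side sketch: GWT history tokens need a leading "!" so the
// resulting URL takes the #! form the crawler recognizes. "photos" is a
// made-up example token.
public class CrawlableNavigation {
  public static void goToPhotos() {
    // Produces a URL like http://host/CrawlTest.html#!photos,
    // which the crawler will fetch as ...?_escaped_fragment_=photos
    History.newItem("!photos");
  }
}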


